Index: head/contrib/llvm-project/clang/include/clang/Driver/Options.td
===================================================================
--- head/contrib/llvm-project/clang/include/clang/Driver/Options.td (revision 362608)
+++ head/contrib/llvm-project/clang/include/clang/Driver/Options.td (revision 362609)
@@ -1,3403 +1,3411 @@
//===--- Options.td - Options for clang -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the options accepted by clang.
//
//===----------------------------------------------------------------------===//
// Include the common option parsing interfaces.
include "llvm/Option/OptParser.td"
/////////
// Flags
// DriverOption - The option is a "driver" option, and should not be forwarded
// to other tools.
def DriverOption : OptionFlag;
// LinkerInput - The option is a linker input.
def LinkerInput : OptionFlag;
// NoArgumentUnused - Don't report argument unused warnings for this option; this
// is useful for options like -static or -dynamic which a user may always end up
// passing, even if the platform defaults to (or only supports) that option.
def NoArgumentUnused : OptionFlag;
// Unsupported - The option is unsupported, and the driver will reject command
// lines that use it.
def Unsupported : OptionFlag;
// Ignored - The option is unsupported, and the driver will silently ignore it.
def Ignored : OptionFlag;
// CoreOption - This is considered a "core" Clang option, available in both
// clang and clang-cl modes.
def CoreOption : OptionFlag;
// CLOption - This is a cl.exe compatibility option. Options with this flag
// are made available when the driver is running in CL compatibility mode.
def CLOption : OptionFlag;
// CC1Option - This option should be accepted by clang -cc1.
def CC1Option : OptionFlag;
// CC1AsOption - This option should be accepted by clang -cc1as.
def CC1AsOption : OptionFlag;
// NoDriverOption - This option should not be accepted by the driver.
def NoDriverOption : OptionFlag;
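// For illustration (a hypothetical option, not defined anywhere in this
// file), a flag meant to be accepted by clang -cc1 and exposed in both
// clang and clang-cl modes would combine these flags like so:
//
//   def example_flag : Flag<["-"], "example-flag">,
//     Flags<[CC1Option, CoreOption]>,
//     HelpText<"Illustrative only; see the real definitions below">;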
// A short name to show in documentation. The name will be interpreted as rST.
class DocName<string name> { string DocName = name; }
// A brief description to show in documentation, interpreted as rST.
class DocBrief<code descr> { code DocBrief = descr; }
// Indicates that this group should be flattened into its parent when generating
// documentation.
class DocFlatten { bit DocFlatten = 1; }
// Indicates that this warning is ignored, but accepted with a warning for
// GCC compatibility.
class IgnoredGCCCompat : Flags<[HelpHidden]> {}
/////////
// Groups
def Action_Group : OptionGroup<"<action group>">, DocName<"Actions">,
DocBrief<[{The action to perform on the input.}]>;
// Meta-group for options which are only used for compilation,
// and not linking etc.
def CompileOnly_Group : OptionGroup<"<CompileOnly group>">,
DocName<"Compilation flags">, DocBrief<[{
Flags controlling the behavior of Clang during compilation. These flags have
no effect during actions that do not perform compilation.}]>;
def Preprocessor_Group : OptionGroup<"<Preprocessor group>">,
Group<CompileOnly_Group>,
DocName<"Preprocessor flags">, DocBrief<[{
Flags controlling the behavior of the Clang preprocessor.}]>;
def IncludePath_Group : OptionGroup<"<I/i group>">, Group<Preprocessor_Group>,
DocName<"Include path management">,
DocBrief<[{
Flags controlling how ``#include``\s are resolved to files.}]>;
def I_Group : OptionGroup<"<I group>">, Group<IncludePath_Group>, DocFlatten;
def i_Group : OptionGroup<"<i group>">, Group<IncludePath_Group>, DocFlatten;
def clang_i_Group : OptionGroup<"<clang i group>">, Group<i_Group>, DocFlatten;
def M_Group : OptionGroup<"<M group>">, Group<Preprocessor_Group>,
DocName<"Dependency file generation">, DocBrief<[{
Flags controlling generation of a dependency file for ``make``-like build
systems.}]>;
def d_Group : OptionGroup<"<d group>">, Group<Preprocessor_Group>,
DocName<"Dumping preprocessor state">, DocBrief<[{
Flags allowing the state of the preprocessor to be dumped in various ways.}]>;
def Diag_Group : OptionGroup<"<W/R group>">, Group<CompileOnly_Group>,
DocName<"Diagnostic flags">, DocBrief<[{
Flags controlling which warnings, errors, and remarks Clang will generate.
See the :doc:`full list of warning and remark flags <DiagnosticsReference>`.}]>;
def R_Group : OptionGroup<"<R group>">, Group<Diag_Group>, DocFlatten;
def R_value_Group : OptionGroup<"<R (with value) group>">, Group<R_Group>,
DocFlatten;
def W_Group : OptionGroup<"<W group>">, Group<Diag_Group>, DocFlatten;
def W_value_Group : OptionGroup<"<W (with value) group>">, Group<W_Group>,
DocFlatten;
def f_Group : OptionGroup<"<f group>">, Group<CompileOnly_Group>,
DocName<"Target-independent compilation options">;
def f_clang_Group : OptionGroup<"<f (clang-only) group>">,
Group<CompileOnly_Group>, DocFlatten;
def pedantic_Group : OptionGroup<"<pedantic group>">, Group<f_Group>,
DocFlatten;
def opencl_Group : OptionGroup<"<opencl group>">, Group<f_Group>,
DocName<"OpenCL flags">;
def m_Group : OptionGroup<"<m group>">, Group<CompileOnly_Group>,
DocName<"Target-dependent compilation options">;
// Feature groups - these take command line options that correspond directly to
// target specific features and can be translated directly from command line
// options.
def m_aarch64_Features_Group : OptionGroup<"<aarch64 features group>">,
Group<m_Group>, DocName<"AARCH64">;
def m_amdgpu_Features_Group : OptionGroup<"<amdgpu features group>">,
Group<m_Group>, DocName<"AMDGPU">;
def m_arm_Features_Group : OptionGroup<"<arm features group>">,
Group<m_Group>, DocName<"ARM">;
def m_hexagon_Features_Group : OptionGroup<"<hexagon features group>">,
Group<m_Group>, DocName<"Hexagon">;
// The features added by this group will not be added to target features.
// These are explicitly handled.
def m_hexagon_Features_HVX_Group : OptionGroup<"<hexagon features group>">,
Group<m_Group>, DocName<"Hexagon">;
def m_mips_Features_Group : OptionGroup<"<mips features group>">,
Group<m_Group>, DocName<"MIPS">;
def m_ppc_Features_Group : OptionGroup<"<ppc features group>">,
Group<m_Group>, DocName<"PowerPC">;
def m_wasm_Features_Group : OptionGroup<"<wasm features group>">,
Group<m_Group>, DocName<"WebAssembly">;
def m_x86_Features_Group : OptionGroup<"<x86 features group>">,
Group<m_Group>, Flags<[CoreOption]>, DocName<"X86">;
def m_riscv_Features_Group : OptionGroup<"<riscv features group>">,
Group<m_Group>, DocName<"RISCV">;
def m_libc_Group : OptionGroup<"<m libc group>">, Group<m_mips_Features_Group>,
Flags<[HelpHidden]>;
def O_Group : OptionGroup<"<O group>">, Group<CompileOnly_Group>,
DocName<"Optimization level">, DocBrief<[{
Flags controlling how much optimization should be performed.}]>;
def DebugInfo_Group : OptionGroup<"<g group>">, Group<CompileOnly_Group>,
DocName<"Debug information generation">, DocBrief<[{
Flags controlling how much and what kind of debug information should be
generated.}]>;
def g_Group : OptionGroup<"<g group>">, Group<DebugInfo_Group>,
DocName<"Kind and level of debug information">;
def gN_Group : OptionGroup<"<gN group>">, Group<g_Group>,
DocName<"Debug level">;
def ggdbN_Group : OptionGroup<"<ggdbN group>">, Group<gN_Group>, DocFlatten;
def gTune_Group : OptionGroup<"<gTune group>">, Group<g_Group>,
DocName<"Debugger to tune debug information for">;
def g_flags_Group : OptionGroup<"<g flags group>">, Group<DebugInfo_Group>,
DocName<"Debug information flags">;
def StaticAnalyzer_Group : OptionGroup<"<Static analyzer group>">,
DocName<"Static analyzer flags">, DocBrief<[{
Flags controlling the behavior of the Clang Static Analyzer.}]>;
// gfortran options that we recognize in the driver and pass along when
// invoking GCC to compile Fortran code.
def gfortran_Group : OptionGroup<"<gfortran group>">,
DocName<"Fortran compilation flags">, DocBrief<[{
Flags that will be passed onto the ``gfortran`` compiler when Clang is given
a Fortran input.}]>;
def Link_Group : OptionGroup<"<T/e/s/t/u group>">, DocName<"Linker flags">,
DocBrief<[{Flags that are passed on to the linker}]>;
def T_Group : OptionGroup<"<T group>">, Group<Link_Group>, DocFlatten;
def u_Group : OptionGroup<"<u group>">, Group<Link_Group>, DocFlatten;
def reserved_lib_Group : OptionGroup<"<reserved libs group>">,
Flags<[Unsupported]>;
// Temporary groups for clang options which we know we don't support,
// but don't want to verbosely warn the user about.
def clang_ignored_f_Group : OptionGroup<"<clang ignored f group>">,
Group<f_Group>, Flags<[Ignored]>;
def clang_ignored_m_Group : OptionGroup<"<clang ignored m group>">,
Group<m_Group>, Flags<[Ignored]>;
// Group for clang options in the process of deprecation.
// Please include the version that deprecated the flag as comment to allow
// easier garbage collection.
def clang_ignored_legacy_options_Group : OptionGroup<"<clang legacy flags>">,
Group<f_Group>, Flags<[Ignored]>;
// Retired with clang-5.0
def : Flag<["-"], "fslp-vectorize-aggressive">, Group<clang_ignored_legacy_options_Group>;
def : Flag<["-"], "fno-slp-vectorize-aggressive">, Group<clang_ignored_legacy_options_Group>;
// Retired with clang-10.0. Previously controlled X86 MPX ISA.
def mmpx : Flag<["-"], "mmpx">, Group<clang_ignored_legacy_options_Group>;
def mno_mpx : Flag<["-"], "mno-mpx">, Group<clang_ignored_legacy_options_Group>;
// Group that ignores all gcc optimizations that won't be implemented
def clang_ignored_gcc_optimization_f_Group : OptionGroup<
"<clang_ignored_gcc_optimization_f_Group>">, Group<f_Group>, Flags<[Ignored]>;
/////////
// Options
// The internal option ID must be a valid C++ identifier and results in a
// clang::driver::options::OPT_XX enum constant for XX.
//
// We want to unambiguously be able to refer to options from the driver source
// code, for this reason the option name is mangled into an ID. This mangling
// isn't guaranteed to have an inverse, but for practical purposes it does.
//
// The mangling scheme is to ignore the leading '-', and perform the following
// substitutions:
// _ => __
// - => _
// / => _SLASH
// # => _HASH
// ? => _QUESTION
// , => _COMMA
// = => _EQ
// C++ => CXX
// . => _
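// For example, applying these rules to options defined later in this file:
//   "-###"                  => OPT__HASH_HASH_HASH
//   "-Wl,"                  => OPT_Wl_COMMA
//   "-fdiagnostics-color="  => OPT_fdiagnostics_color_EQ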
// Developer Driver Options
def internal_Group : OptionGroup<"<clang internal options>">, Flags<[HelpHidden]>;
def internal_driver_Group : OptionGroup<"<clang driver internal options>">,
Group<internal_Group>, HelpText<"DRIVER OPTIONS">;
def internal_debug_Group :
OptionGroup<"<clang debug/development internal options>">,
Group<internal_Group>, HelpText<"DEBUG/DEVELOPMENT OPTIONS">;
class InternalDriverOpt : Group<internal_driver_Group>,
Flags<[DriverOption, HelpHidden]>;
def driver_mode : Joined<["--"], "driver-mode=">, Group<internal_driver_Group>,
Flags<[CoreOption, DriverOption, HelpHidden]>,
HelpText<"Set the driver mode to either 'gcc', 'g++', 'cpp', or 'cl'">;
def rsp_quoting : Joined<["--"], "rsp-quoting=">, Group<internal_driver_Group>,
Flags<[CoreOption, DriverOption, HelpHidden]>,
HelpText<"Set the rsp quoting to either 'posix', or 'windows'">;
def ccc_gcc_name : Separate<["-"], "ccc-gcc-name">, InternalDriverOpt,
HelpText<"Name for native GCC compiler">,
MetaVarName<"<gcc-path>">;
class InternalDebugOpt : Group<internal_debug_Group>,
Flags<[DriverOption, HelpHidden, CoreOption]>;
def ccc_install_dir : Separate<["-"], "ccc-install-dir">, InternalDebugOpt,
HelpText<"Simulate installation in the given directory">;
def ccc_print_phases : Flag<["-"], "ccc-print-phases">, InternalDebugOpt,
HelpText<"Dump list of actions to perform">;
def ccc_print_bindings : Flag<["-"], "ccc-print-bindings">, InternalDebugOpt,
HelpText<"Show bindings of tools to actions">;
def ccc_arcmt_check : Flag<["-"], "ccc-arcmt-check">, InternalDriverOpt,
HelpText<"Check for ARC migration issues that need manual handling">;
def ccc_arcmt_modify : Flag<["-"], "ccc-arcmt-modify">, InternalDriverOpt,
HelpText<"Apply modifications to files to conform to ARC">;
def ccc_arcmt_migrate : Separate<["-"], "ccc-arcmt-migrate">, InternalDriverOpt,
HelpText<"Apply modifications and produces temporary files that conform to ARC">;
def arcmt_migrate_report_output : Separate<["-"], "arcmt-migrate-report-output">,
HelpText<"Output path for the plist report">, Flags<[CC1Option]>;
def arcmt_migrate_emit_arc_errors : Flag<["-"], "arcmt-migrate-emit-errors">,
HelpText<"Emit ARC errors even if the migrator can fix them">,
Flags<[CC1Option]>;
def gen_reproducer: Flag<["-"], "gen-reproducer">, InternalDebugOpt,
HelpText<"Auto-generates preprocessed source files and a reproduction script">;
def gen_cdb_fragment_path: Separate<["-"], "gen-cdb-fragment-path">, InternalDebugOpt,
HelpText<"Emit a compilation database fragment to the specified directory">;
def _migrate : Flag<["--"], "migrate">, Flags<[DriverOption]>,
HelpText<"Run the migrator">;
def ccc_objcmt_migrate : Separate<["-"], "ccc-objcmt-migrate">,
InternalDriverOpt,
HelpText<"Apply modifications and produces temporary files to migrate to "
"modern ObjC syntax">;
def objcmt_migrate_literals : Flag<["-"], "objcmt-migrate-literals">, Flags<[CC1Option]>,
HelpText<"Enable migration to modern ObjC literals">;
def objcmt_migrate_subscripting : Flag<["-"], "objcmt-migrate-subscripting">, Flags<[CC1Option]>,
HelpText<"Enable migration to modern ObjC subscripting">;
def objcmt_migrate_property : Flag<["-"], "objcmt-migrate-property">, Flags<[CC1Option]>,
HelpText<"Enable migration to modern ObjC property">;
def objcmt_migrate_all : Flag<["-"], "objcmt-migrate-all">, Flags<[CC1Option]>,
HelpText<"Enable migration to modern ObjC">;
def objcmt_migrate_readonly_property : Flag<["-"], "objcmt-migrate-readonly-property">, Flags<[CC1Option]>,
HelpText<"Enable migration to modern ObjC readonly property">;
def objcmt_migrate_readwrite_property : Flag<["-"], "objcmt-migrate-readwrite-property">, Flags<[CC1Option]>,
HelpText<"Enable migration to modern ObjC readwrite property">;
def objcmt_migrate_property_dot_syntax : Flag<["-"], "objcmt-migrate-property-dot-syntax">, Flags<[CC1Option]>,
HelpText<"Enable migration of setter/getter messages to property-dot syntax">;
def objcmt_migrate_annotation : Flag<["-"], "objcmt-migrate-annotation">, Flags<[CC1Option]>,
HelpText<"Enable migration to property and method annotations">;
def objcmt_migrate_instancetype : Flag<["-"], "objcmt-migrate-instancetype">, Flags<[CC1Option]>,
HelpText<"Enable migration to infer instancetype for method result type">;
def objcmt_migrate_nsmacros : Flag<["-"], "objcmt-migrate-ns-macros">, Flags<[CC1Option]>,
HelpText<"Enable migration to NS_ENUM/NS_OPTIONS macros">;
def objcmt_migrate_protocol_conformance : Flag<["-"], "objcmt-migrate-protocol-conformance">, Flags<[CC1Option]>,
HelpText<"Enable migration to add protocol conformance on classes">;
def objcmt_atomic_property : Flag<["-"], "objcmt-atomic-property">, Flags<[CC1Option]>,
HelpText<"Make migration to 'atomic' properties">;
def objcmt_returns_innerpointer_property : Flag<["-"], "objcmt-returns-innerpointer-property">, Flags<[CC1Option]>,
HelpText<"Enable migration to annotate property with NS_RETURNS_INNER_POINTER">;
def objcmt_ns_nonatomic_iosonly: Flag<["-"], "objcmt-ns-nonatomic-iosonly">, Flags<[CC1Option]>,
HelpText<"Enable migration to use NS_NONATOMIC_IOSONLY macro for setting property's 'atomic' attribute">;
def objcmt_migrate_designated_init : Flag<["-"], "objcmt-migrate-designated-init">, Flags<[CC1Option]>,
HelpText<"Enable migration to infer NS_DESIGNATED_INITIALIZER for initializer methods">;
def objcmt_whitelist_dir_path: Joined<["-"], "objcmt-whitelist-dir-path=">, Flags<[CC1Option]>,
HelpText<"Only modify files with a filename contained in the provided directory path">;
// The misspelt "white-list" [sic] alias is due for removal.
def : Joined<["-"], "objcmt-white-list-dir-path=">, Flags<[CC1Option]>,
Alias<objcmt_whitelist_dir_path>;
// Make sure all other -ccc- options are rejected.
def ccc_ : Joined<["-"], "ccc-">, Group<internal_Group>, Flags<[Unsupported]>;
// Standard Options
def _HASH_HASH_HASH : Flag<["-"], "###">, Flags<[DriverOption, CoreOption]>,
HelpText<"Print (but do not run) the commands to run for this compilation">;
def _DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>,
Flags<[DriverOption, CoreOption]>;
def A : JoinedOrSeparate<["-"], "A">, Flags<[RenderJoined]>, Group<gfortran_Group>;
def B : JoinedOrSeparate<["-"], "B">, MetaVarName<"<dir>">,
HelpText<"Add <dir> to search path for binaries and object files used implicitly">;
def CC : Flag<["-"], "CC">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
HelpText<"Include comments from within macros in preprocessed output">;
def C : Flag<["-"], "C">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
HelpText<"Include comments in preprocessed output">;
def D : JoinedOrSeparate<["-"], "D">, Group<Preprocessor_Group>,
Flags<[CC1Option]>, MetaVarName<"<macro>=<value>">,
HelpText<"Define <macro> to <value> (or 1 if <value> omitted)">;
def E : Flag<["-"], "E">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
HelpText<"Only run the preprocessor">;
def F : JoinedOrSeparate<["-"], "F">, Flags<[RenderJoined,CC1Option]>,
HelpText<"Add directory to framework include search path">;
def G : JoinedOrSeparate<["-"], "G">, Flags<[DriverOption]>, Group<m_Group>,
MetaVarName<"<size>">, HelpText<"Put objects of at most <size> bytes "
"into small data section (MIPS / Hexagon)">;
def G_EQ : Joined<["-"], "G=">, Flags<[DriverOption]>, Group<m_Group>, Alias<G>;
def H : Flag<["-"], "H">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
HelpText<"Show header includes and nesting depth">;
def I_ : Flag<["-"], "I-">, Group<I_Group>,
HelpText<"Restrict all prior -I flags to double-quoted inclusion and "
"remove current directory from include path">;
def I : JoinedOrSeparate<["-"], "I">, Group<I_Group>,
Flags<[CC1Option,CC1AsOption]>, MetaVarName<"<dir>">,
HelpText<"Add directory to include search path">;
def L : JoinedOrSeparate<["-"], "L">, Flags<[RenderJoined]>, Group<Link_Group>,
MetaVarName<"<dir>">, HelpText<"Add directory to library search path">;
def MD : Flag<["-"], "MD">, Group<M_Group>,
HelpText<"Write a depfile containing user and system headers">;
def MMD : Flag<["-"], "MMD">, Group<M_Group>,
HelpText<"Write a depfile containing user headers">;
def M : Flag<["-"], "M">, Group<M_Group>,
HelpText<"Like -MD, but also implies -E and writes to stdout by default">;
def MM : Flag<["-"], "MM">, Group<M_Group>,
HelpText<"Like -MMD, but also implies -E and writes to stdout by default">;
def MF : JoinedOrSeparate<["-"], "MF">, Group<M_Group>,
HelpText<"Write depfile output from -MMD, -MD, -MM, or -M to <file>">,
MetaVarName<"<file>">;
def MG : Flag<["-"], "MG">, Group<M_Group>, Flags<[CC1Option]>,
HelpText<"Add missing headers to depfile">;
def MJ : JoinedOrSeparate<["-"], "MJ">, Group<M_Group>,
HelpText<"Write a compilation database entry per input">;
def MP : Flag<["-"], "MP">, Group<M_Group>, Flags<[CC1Option]>,
HelpText<"Create phony target for each dependency (other than main file)">;
def MQ : JoinedOrSeparate<["-"], "MQ">, Group<M_Group>, Flags<[CC1Option]>,
HelpText<"Specify name of main file output to quote in depfile">;
def MT : JoinedOrSeparate<["-"], "MT">, Group<M_Group>, Flags<[CC1Option]>,
HelpText<"Specify name of main file output in depfile">;
def MV : Flag<["-"], "MV">, Group<M_Group>, Flags<[CC1Option]>,
HelpText<"Use NMake/Jom format for the depfile">;
def Mach : Flag<["-"], "Mach">, Group<Link_Group>;
def O0 : Flag<["-"], "O0">, Group<O_Group>, Flags<[CC1Option, HelpHidden]>;
def O4 : Flag<["-"], "O4">, Group<O_Group>, Flags<[CC1Option, HelpHidden]>;
def ObjCXX : Flag<["-"], "ObjC++">, Flags<[DriverOption]>,
HelpText<"Treat source input files as Objective-C++ inputs">;
def ObjC : Flag<["-"], "ObjC">, Flags<[DriverOption]>,
HelpText<"Treat source input files as Objective-C inputs">;
def O : Joined<["-"], "O">, Group<O_Group>, Flags<[CC1Option]>;
def O_flag : Flag<["-"], "O">, Flags<[CC1Option]>, Alias<O>, AliasArgs<["2"]>;
def Ofast : Joined<["-"], "Ofast">, Group<O_Group>, Flags<[CC1Option]>;
def P : Flag<["-"], "P">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
HelpText<"Disable linemarker output in -E mode">;
def Qy : Flag<["-"], "Qy">, Flags<[CC1Option]>,
HelpText<"Emit metadata containing compiler name and version">;
def Qn : Flag<["-"], "Qn">, Flags<[CC1Option]>,
HelpText<"Do not emit metadata containing compiler name and version">;
def : Flag<["-"], "fident">, Group<f_Group>, Alias<Qy>, Flags<[CC1Option]>;
def : Flag<["-"], "fno-ident">, Group<f_Group>, Alias<Qn>, Flags<[CC1Option]>;
def Qunused_arguments : Flag<["-"], "Qunused-arguments">, Flags<[DriverOption, CoreOption]>,
HelpText<"Don't emit warning for unused driver arguments">;
def Q : Flag<["-"], "Q">, IgnoredGCCCompat;
def Rpass_EQ : Joined<["-"], "Rpass=">, Group<R_value_Group>, Flags<[CC1Option]>,
HelpText<"Report transformations performed by optimization passes whose "
"name matches the given POSIX regular expression">;
def Rpass_missed_EQ : Joined<["-"], "Rpass-missed=">, Group<R_value_Group>,
Flags<[CC1Option]>,
HelpText<"Report missed transformations by optimization passes whose "
"name matches the given POSIX regular expression">;
def Rpass_analysis_EQ : Joined<["-"], "Rpass-analysis=">, Group<R_value_Group>,
Flags<[CC1Option]>,
HelpText<"Report transformation analysis from optimization passes whose "
"name matches the given POSIX regular expression">;
def R_Joined : Joined<["-"], "R">, Group<R_Group>, Flags<[CC1Option, CoreOption]>,
MetaVarName<"<remark>">, HelpText<"Enable the specified remark">;
def S : Flag<["-"], "S">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
HelpText<"Only run preprocess and compilation steps">;
def Tbss : JoinedOrSeparate<["-"], "Tbss">, Group<T_Group>,
MetaVarName<"<addr>">, HelpText<"Set starting address of BSS to <addr>">;
def Tdata : JoinedOrSeparate<["-"], "Tdata">, Group<T_Group>,
MetaVarName<"<addr>">, HelpText<"Set starting address of DATA to <addr>">;
def Ttext : JoinedOrSeparate<["-"], "Ttext">, Group<T_Group>,
MetaVarName<"<addr>">, HelpText<"Set starting address of TEXT to <addr>">;
def T : JoinedOrSeparate<["-"], "T">, Group<T_Group>,
MetaVarName<"<script>">, HelpText<"Specify <script> as linker script">;
def U : JoinedOrSeparate<["-"], "U">, Group<Preprocessor_Group>,
Flags<[CC1Option]>, MetaVarName<"<macro>">, HelpText<"Undefine macro <macro>">;
def V : JoinedOrSeparate<["-"], "V">, Flags<[DriverOption, Unsupported]>;
def Wa_COMMA : CommaJoined<["-"], "Wa,">,
HelpText<"Pass the comma separated arguments in <arg> to the assembler">,
MetaVarName<"<arg>">;
def Wall : Flag<["-"], "Wall">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
def WCL4 : Flag<["-"], "WCL4">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
def Wdeprecated : Flag<["-"], "Wdeprecated">, Group<W_Group>, Flags<[CC1Option]>,
HelpText<"Enable warnings for deprecated constructs and define __DEPRECATED">;
def Wno_deprecated : Flag<["-"], "Wno-deprecated">, Group<W_Group>, Flags<[CC1Option]>;
def Wl_COMMA : CommaJoined<["-"], "Wl,">, Flags<[LinkerInput, RenderAsInput]>,
HelpText<"Pass the comma separated arguments in <arg> to the linker">,
MetaVarName<"<arg>">, Group<Link_Group>;
// FIXME: This is broken; these should not be Joined arguments.
def Wno_nonportable_cfstrings : Joined<["-"], "Wno-nonportable-cfstrings">, Group<W_Group>,
Flags<[CC1Option]>;
def Wnonportable_cfstrings : Joined<["-"], "Wnonportable-cfstrings">, Group<W_Group>,
Flags<[CC1Option]>;
def Wp_COMMA : CommaJoined<["-"], "Wp,">,
HelpText<"Pass the comma separated arguments in <arg> to the preprocessor">,
MetaVarName<"<arg>">, Group<Preprocessor_Group>;
def Wwrite_strings : Flag<["-"], "Wwrite-strings">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
def Wno_write_strings : Flag<["-"], "Wno-write-strings">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
def W_Joined : Joined<["-"], "W">, Group<W_Group>, Flags<[CC1Option, CoreOption]>,
MetaVarName<"<warning>">, HelpText<"Enable the specified warning">;
def Xanalyzer : Separate<["-"], "Xanalyzer">,
HelpText<"Pass <arg> to the static analyzer">, MetaVarName<"<arg>">,
Group<StaticAnalyzer_Group>;
def Xarch__ : JoinedAndSeparate<["-"], "Xarch_">, Flags<[DriverOption]>;
def Xassembler : Separate<["-"], "Xassembler">,
HelpText<"Pass <arg> to the assembler">, MetaVarName<"<arg>">,
Group<CompileOnly_Group>;
def Xclang : Separate<["-"], "Xclang">,
HelpText<"Pass <arg> to the clang compiler">, MetaVarName<"<arg>">,
Flags<[DriverOption, CoreOption]>, Group<CompileOnly_Group>;
def Xcuda_fatbinary : Separate<["-"], "Xcuda-fatbinary">,
HelpText<"Pass <arg> to fatbinary invocation">, MetaVarName<"<arg>">;
def Xcuda_ptxas : Separate<["-"], "Xcuda-ptxas">,
HelpText<"Pass <arg> to the ptxas assembler">, MetaVarName<"<arg>">;
def Xopenmp_target : Separate<["-"], "Xopenmp-target">,
HelpText<"Pass <arg> to the target offloading toolchain.">, MetaVarName<"<arg>">;
def Xopenmp_target_EQ : JoinedAndSeparate<["-"], "Xopenmp-target=">,
HelpText<"Pass <arg> to the target offloading toolchain identified by <triple>.">,
MetaVarName<"<triple> <arg>">;
def z : Separate<["-"], "z">, Flags<[LinkerInput, RenderAsInput]>,
HelpText<"Pass -z <arg> to the linker">, MetaVarName<"<arg>">,
Group<Link_Group>;
def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
HelpText<"Pass <arg> to the linker">, MetaVarName<"<arg>">,
Group<Link_Group>;
def Xpreprocessor : Separate<["-"], "Xpreprocessor">, Group<Preprocessor_Group>,
HelpText<"Pass <arg> to the preprocessor">, MetaVarName<"<arg>">;
def X_Flag : Flag<["-"], "X">, Group<Link_Group>;
def X_Joined : Joined<["-"], "X">, IgnoredGCCCompat;
def Z_Flag : Flag<["-"], "Z">, Group<Link_Group>;
// FIXME: All we do with this is reject it. Remove.
def Z_Joined : Joined<["-"], "Z">;
def all__load : Flag<["-"], "all_load">;
def allowable__client : Separate<["-"], "allowable_client">;
def ansi : Flag<["-", "--"], "ansi">;
def arch__errors__fatal : Flag<["-"], "arch_errors_fatal">;
def arch : Separate<["-"], "arch">, Flags<[DriverOption]>;
def arch__only : Separate<["-"], "arch_only">;
def a : Joined<["-"], "a">;
def autocomplete : Joined<["--"], "autocomplete=">;
def bind__at__load : Flag<["-"], "bind_at_load">;
def bundle__loader : Separate<["-"], "bundle_loader">;
def bundle : Flag<["-"], "bundle">;
def b : JoinedOrSeparate<["-"], "b">, Flags<[Unsupported]>;
def cl_opt_disable : Flag<["-"], "cl-opt-disable">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. This option disables all optimizations. By default optimizations are enabled.">;
def cl_strict_aliasing : Flag<["-"], "cl-strict-aliasing">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. This option is added for compatibility with OpenCL 1.0.">;
def cl_single_precision_constant : Flag<["-"], "cl-single-precision-constant">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Treat double precision floating-point constant as single precision constant.">;
def cl_finite_math_only : Flag<["-"], "cl-finite-math-only">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">;
def cl_kernel_arg_info : Flag<["-"], "cl-kernel-arg-info">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Generate kernel argument metadata.">;
def cl_unsafe_math_optimizations : Flag<["-"], "cl-unsafe-math-optimizations">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Allow unsafe floating-point optimizations. Also implies -cl-no-signed-zeros and -cl-mad-enable.">;
def cl_fast_relaxed_math : Flag<["-"], "cl-fast-relaxed-math">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Sets -cl-finite-math-only and -cl-unsafe-math-optimizations, and defines __FAST_RELAXED_MATH__.">;
def cl_mad_enable : Flag<["-"], "cl-mad-enable">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Allow use of less precise MAD computations in the generated binary.">;
def cl_no_signed_zeros : Flag<["-"], "cl-no-signed-zeros">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Allow use of less precise no signed zeros computations in the generated binary.">;
def cl_std_EQ : Joined<["-"], "cl-std=">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL language standard to compile for.">, Values<"cl,CL,cl1.1,CL1.1,cl1.2,CL1.2,cl2.0,CL2.0,clc++,CLC++">;
def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Allow denormals to be flushed to zero.">;
def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">;
def cl_uniform_work_group_size : Flag<["-"], "cl-uniform-work-group-size">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel">;
def client__name : JoinedOrSeparate<["-"], "client_name">;
def combine : Flag<["-", "--"], "combine">, Flags<[DriverOption, Unsupported]>;
def compatibility__version : JoinedOrSeparate<["-"], "compatibility_version">;
def config : Separate<["--"], "config">, Flags<[DriverOption]>,
HelpText<"Specifies configuration file">;
def config_system_dir_EQ : Joined<["--"], "config-system-dir=">, Flags<[DriverOption, HelpHidden]>,
HelpText<"System directory for configuration files">;
def config_user_dir_EQ : Joined<["--"], "config-user-dir=">, Flags<[DriverOption, HelpHidden]>,
HelpText<"User directory for configuration files">;
def coverage : Flag<["-", "--"], "coverage">, Flags<[CoreOption]>;
def cpp_precomp : Flag<["-"], "cpp-precomp">, Group<clang_ignored_f_Group>;
def current__version : JoinedOrSeparate<["-"], "current_version">;
def cxx_isystem : JoinedOrSeparate<["-"], "cxx-isystem">, Group<clang_i_Group>,
HelpText<"Add directory to the C++ SYSTEM include search path">, Flags<[CC1Option]>,
MetaVarName<"<directory>">;
def c : Flag<["-"], "c">, Flags<[DriverOption]>, Group<Action_Group>,
HelpText<"Only run preprocess, compile, and assemble steps">;
def fconvergent_functions : Flag<["-"], "fconvergent-functions">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Assume functions may be convergent">;
def cuda_device_only : Flag<["--"], "cuda-device-only">,
HelpText<"Compile CUDA code for device only">;
def cuda_host_only : Flag<["--"], "cuda-host-only">,
HelpText<"Compile CUDA code for host only. Has no effect on non-CUDA "
"compilations.">;
def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">,
HelpText<"Compile CUDA code for both host and device (default). Has no "
"effect on non-CUDA compilations.">;
def cuda_include_ptx_EQ : Joined<["--"], "cuda-include-ptx=">, Flags<[DriverOption]>,
HelpText<"Include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">;
def no_cuda_include_ptx_EQ : Joined<["--"], "no-cuda-include-ptx=">, Flags<[DriverOption]>,
HelpText<"Do not include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">;
def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>,
HelpText<"CUDA GPU architecture (e.g. sm_35). May be specified more than once.">;
def hip_link : Flag<["--"], "hip-link">,
HelpText<"Link clang-offload-bundler bundles for HIP">;
def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">, Flags<[DriverOption]>,
HelpText<"Remove GPU architecture (e.g. sm_35) from the list of GPUs to compile for. "
"'all' resets the list to its default value.">;
def cuda_noopt_device_debug : Flag<["--"], "cuda-noopt-device-debug">,
HelpText<"Enable device-side debug info generation. Disables ptxas optimizations.">;
def no_cuda_version_check : Flag<["--"], "no-cuda-version-check">,
HelpText<"Don't error out if the detected version of the CUDA install is "
"too low for the requested CUDA gpu architecture.">;
def no_cuda_noopt_device_debug : Flag<["--"], "no-cuda-noopt-device-debug">;
def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,
HelpText<"CUDA installation path">;
def cuda_path_ignore_env : Flag<["--"], "cuda-path-ignore-env">, Group<i_Group>,
HelpText<"Ignore environment variables to detect CUDA installation">;
def ptxas_path_EQ : Joined<["--"], "ptxas-path=">, Group<i_Group>,
HelpText<"Path to ptxas (used for compiling CUDA code)">;
def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;
def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
def fgpu_rdc : Flag<["-"], "fgpu-rdc">, Flags<[CC1Option]>,
HelpText<"Generate relocatable device code, also known as separate compilation mode.">;
def fno_gpu_rdc : Flag<["-"], "fno-gpu-rdc">;
def : Flag<["-"], "fcuda-rdc">, Alias<fgpu_rdc>;
def : Flag<["-"], "fno-cuda-rdc">, Alias<fno_gpu_rdc>;
def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>,
HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">;
def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">;
def hip_device_lib_path_EQ : Joined<["--"], "hip-device-lib-path=">, Group<Link_Group>,
HelpText<"HIP device library path">;
def hip_device_lib_EQ : Joined<["--"], "hip-device-lib=">, Group<Link_Group>,
HelpText<"HIP device library">;
def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">,
Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>;
def fhip_new_launch_api : Flag<["-"], "fhip-new-launch-api">,
Flags<[CC1Option]>, HelpText<"Use new kernel launching API for HIP.">;
def fno_hip_new_launch_api : Flag<["-"], "fno-hip-new-launch-api">;
def fgpu_allow_device_init : Flag<["-"], "fgpu-allow-device-init">,
Flags<[CC1Option]>, HelpText<"Allow device side init function in HIP">;
def fno_gpu_allow_device_init : Flag<["-"], "fno-gpu-allow-device-init">;
def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">,
Flags<[CC1Option]>,
HelpText<"Default max threads per block for kernel launch bounds for HIP">;
def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group<i_Group>,
HelpText<"Path to libomptarget-nvptx libraries">;
def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
HelpText<"Print macro definitions in -E mode in addition to normal output">;
def dI : Flag<["-"], "dI">, Group<d_Group>, Flags<[CC1Option]>,
HelpText<"Print include directives in -E mode in addition to normal output">;
def dM : Flag<["-"], "dM">, Group<d_Group>, Flags<[CC1Option]>,
HelpText<"Print macro definitions in -E mode instead of normal output">;
def dead__strip : Flag<["-"], "dead_strip">;
def dependency_file : Separate<["-"], "dependency-file">, Flags<[CC1Option]>,
HelpText<"Filename (or -) to write dependency output to">;
def dependency_dot : Separate<["-"], "dependency-dot">, Flags<[CC1Option]>,
HelpText<"Filename to write DOT-formatted header dependencies to">;
def module_dependency_dir : Separate<["-"], "module-dependency-dir">,
Flags<[CC1Option]>, HelpText<"Directory to dump module dependencies to">;
def dumpmachine : Flag<["-"], "dumpmachine">;
def dumpspecs : Flag<["-"], "dumpspecs">, Flags<[Unsupported]>;
def dumpversion : Flag<["-"], "dumpversion">;
def dylib__file : Separate<["-"], "dylib_file">;
def dylinker__install__name : JoinedOrSeparate<["-"], "dylinker_install_name">;
def dylinker : Flag<["-"], "dylinker">;
def dynamiclib : Flag<["-"], "dynamiclib">;
def dynamic : Flag<["-"], "dynamic">, Flags<[NoArgumentUnused]>;
def d_Flag : Flag<["-"], "d">, Group<d_Group>;
def d_Joined : Joined<["-"], "d">, Group<d_Group>;
def emit_ast : Flag<["-"], "emit-ast">,
HelpText<"Emit Clang AST files for source inputs">;
def emit_llvm : Flag<["-"], "emit-llvm">, Flags<[CC1Option]>, Group<Action_Group>,
HelpText<"Use the LLVM representation for assembler and object files">;
def emit_interface_stubs : Flag<["-"], "emit-interface-stubs">, Flags<[CC1Option]>, Group<Action_Group>,
HelpText<"Generate Inteface Stub Files.">;
def emit_merged_ifs : Flag<["-"], "emit-merged-ifs">,
Flags<[CC1Option]>, Group<Action_Group>,
HelpText<"Generate Interface Stub Files, emit merged text not binary.">;
def interface_stub_version_EQ : JoinedOrSeparate<["-"], "interface-stub-version=">, Flags<[CC1Option]>;
def exported__symbols__list : Separate<["-"], "exported_symbols_list">;
def e : JoinedOrSeparate<["-"], "e">, Group<Link_Group>;
def fPIC : Flag<["-"], "fPIC">, Group<f_Group>;
def fno_PIC : Flag<["-"], "fno-PIC">, Group<f_Group>;
def fPIE : Flag<["-"], "fPIE">, Group<f_Group>;
def fno_PIE : Flag<["-"], "fno-PIE">, Group<f_Group>;
def faccess_control : Flag<["-"], "faccess-control">, Group<f_Group>;
def falign_functions : Flag<["-"], "falign-functions">, Group<f_Group>;
def falign_functions_EQ : Joined<["-"], "falign-functions=">, Group<f_Group>;
def fno_align_functions: Flag<["-"], "fno-align-functions">, Group<f_Group>;
def fallow_unsupported : Flag<["-"], "fallow-unsupported">, Group<f_Group>;
def fapple_kext : Flag<["-"], "fapple-kext">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Use Apple's kernel extensions ABI">;
def fapple_pragma_pack : Flag<["-"], "fapple-pragma-pack">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Enable Apple gcc-compatible #pragma pack handling">;
def shared_libsan : Flag<["-"], "shared-libsan">,
HelpText<"Dynamically link the sanitizer runtime">;
def static_libsan : Flag<["-"], "static-libsan">,
HelpText<"Statically link the sanitizer runtime">;
def : Flag<["-"], "shared-libasan">, Alias<shared_libsan>;
def fasm : Flag<["-"], "fasm">, Group<f_Group>;
def fasm_blocks : Flag<["-"], "fasm-blocks">, Group<f_Group>, Flags<[CC1Option]>;
def fno_asm_blocks : Flag<["-"], "fno-asm-blocks">, Group<f_Group>;
def fassume_sane_operator_new : Flag<["-"], "fassume-sane-operator-new">, Group<f_Group>;
def fastcp : Flag<["-"], "fastcp">, Group<f_Group>;
def fastf : Flag<["-"], "fastf">, Group<f_Group>;
def fast : Flag<["-"], "fast">, Group<f_Group>;
def fasynchronous_unwind_tables : Flag<["-"], "fasynchronous-unwind-tables">, Group<f_Group>;
def fdouble_square_bracket_attributes : Flag<[ "-" ], "fdouble-square-bracket-attributes">,
Group<f_Group>, Flags<[DriverOption, CC1Option]>,
HelpText<"Enable '[[]]' attributes in all C and C++ language modes">;
def fno_double_square_bracket_attributes : Flag<[ "-" ], "fno-double-square-bracket-attributes">,
Group<f_Group>, Flags<[DriverOption, CC1Option]>,
HelpText<"Disable '[[]]' attributes in all C and C++ language modes">;
def fautolink : Flag <["-"], "fautolink">, Group<f_Group>;
def fno_autolink : Flag <["-"], "fno-autolink">, Group<f_Group>,
Flags<[DriverOption, CC1Option]>,
HelpText<"Disable generation of linker directives for automatic library linking">;
// C++ Coroutines TS
def fcoroutines_ts : Flag <["-"], "fcoroutines-ts">, Group<f_Group>,
Flags<[DriverOption, CC1Option]>,
HelpText<"Enable support for the C++ Coroutines TS">;
def fno_coroutines_ts : Flag <["-"], "fno-coroutines-ts">, Group<f_Group>,
Flags<[DriverOption]>;
def fembed_bitcode_EQ : Joined<["-"], "fembed-bitcode=">,
Group<f_Group>, Flags<[DriverOption, CC1Option, CC1AsOption]>, MetaVarName<"<option>">,
HelpText<"Embed LLVM bitcode (option: off, all, bitcode, marker)">;
def fembed_bitcode : Flag<["-"], "fembed-bitcode">, Group<f_Group>,
Alias<fembed_bitcode_EQ>, AliasArgs<["all"]>,
HelpText<"Embed LLVM IR bitcode as data">;
def fembed_bitcode_marker : Flag<["-"], "fembed-bitcode-marker">,
Alias<fembed_bitcode_EQ>, AliasArgs<["marker"]>,
HelpText<"Embed placeholder LLVM IR data as a marker">;
def fgnu_inline_asm : Flag<["-"], "fgnu-inline-asm">, Group<f_Group>, Flags<[DriverOption]>;
def fno_gnu_inline_asm : Flag<["-"], "fno-gnu-inline-asm">, Group<f_Group>,
Flags<[DriverOption, CC1Option]>,
HelpText<"Disable GNU style inline asm">;
def fprofile_sample_use : Flag<["-"], "fprofile-sample-use">, Group<f_Group>,
Flags<[CoreOption]>;
def fno_profile_sample_use : Flag<["-"], "fno-profile-sample-use">, Group<f_Group>,
Flags<[CoreOption]>;
def fprofile_sample_use_EQ : Joined<["-"], "fprofile-sample-use=">,
Group<f_Group>, Flags<[DriverOption, CC1Option]>,
HelpText<"Enable sample-based profile guided optimizations">;
def fprofile_sample_accurate : Flag<["-"], "fprofile-sample-accurate">,
Group<f_Group>, Flags<[DriverOption, CC1Option]>,
HelpText<"Specifies that the sample profile is accurate">,
DocBrief<[{Specifies that the sample profile is accurate. If the sample
profile is accurate, callsites without profile samples are marked
as cold. Otherwise, treat callsites without profile samples as if
we have no profile}]>;
def fno_profile_sample_accurate : Flag<["-"], "fno-profile-sample-accurate">,
Group<f_Group>, Flags<[DriverOption]>;
def fauto_profile : Flag<["-"], "fauto-profile">, Group<f_Group>,
Alias<fprofile_sample_use>;
def fno_auto_profile : Flag<["-"], "fno-auto-profile">, Group<f_Group>,
Alias<fno_profile_sample_use>;
def fauto_profile_EQ : Joined<["-"], "fauto-profile=">,
Alias<fprofile_sample_use_EQ>;
def fauto_profile_accurate : Flag<["-"], "fauto-profile-accurate">,
Group<f_Group>, Alias<fprofile_sample_accurate>;
def fno_auto_profile_accurate : Flag<["-"], "fno-auto-profile-accurate">,
Group<f_Group>, Alias<fno_profile_sample_accurate>;
def fdebug_compilation_dir : Separate<["-"], "fdebug-compilation-dir">,
Group<f_Group>, Flags<[CC1Option, CC1AsOption, CoreOption]>,
HelpText<"The compilation directory to embed in the debug info.">;
def fdebug_compilation_dir_EQ : Joined<["-"], "fdebug-compilation-dir=">,
Group<f_Group>, Flags<[CC1Option, CC1AsOption, CoreOption]>,
Alias<fdebug_compilation_dir>;
def fdebug_info_for_profiling : Flag<["-"], "fdebug-info-for-profiling">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Emit extra debug info to make sample profile more accurate.">;
def fno_debug_info_for_profiling : Flag<["-"], "fno-debug-info-for-profiling">,
Group<f_Group>, Flags<[DriverOption]>,
HelpText<"Do not emit extra debug info for sample profiler.">;
def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Generate instrumented code to collect execution counts into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">;
def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">,
Group<f_Group>, Flags<[CoreOption]>, MetaVarName<"<file>">,
HelpText<"Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)">;
def fprofile_instr_use : Flag<["-"], "fprofile-instr-use">, Group<f_Group>,
Flags<[CoreOption]>;
def fprofile_instr_use_EQ : Joined<["-"], "fprofile-instr-use=">,
Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Use instrumentation data for profile-guided optimization">;
def fprofile_remapping_file_EQ : Joined<["-"], "fprofile-remapping-file=">,
Group<f_Group>, Flags<[CC1Option, CoreOption]>, MetaVarName<"<file>">,
HelpText<"Use the remappings described in <file> to match the profile data against names in the program">;
def fprofile_remapping_file : Separate<["-"], "fprofile-remapping-file">,
Group<f_Group>, Flags<[CoreOption]>, Alias<fprofile_remapping_file_EQ>;
def fcoverage_mapping : Flag<["-"], "fcoverage-mapping">,
Group<f_Group>, Flags<[CC1Option, CoreOption]>,
HelpText<"Generate coverage mapping to enable code coverage analysis">;
def fno_coverage_mapping : Flag<["-"], "fno-coverage-mapping">,
Group<f_Group>, Flags<[DriverOption, CoreOption]>,
HelpText<"Disable code coverage analysis">;
def fprofile_generate : Flag<["-"], "fprofile-generate">,
Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
def fprofile_generate_EQ : Joined<["-"], "fprofile-generate=">,
Group<f_Group>, Flags<[CoreOption]>, MetaVarName<"<directory>">,
HelpText<"Generate instrumented code to collect execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
def fcs_profile_generate : Flag<["-"], "fcs-profile-generate">,
Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Generate instrumented code to collect context sensitive execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
def fcs_profile_generate_EQ : Joined<["-"], "fcs-profile-generate=">,
Group<f_Group>, Flags<[CoreOption]>, MetaVarName<"<directory>">,
HelpText<"Generate instrumented code to collect context sensitive execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
def fprofile_use : Flag<["-"], "fprofile-use">, Group<f_Group>,
Alias<fprofile_instr_use>;
def fprofile_use_EQ : Joined<["-"], "fprofile-use=">,
Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<pathname>">,
HelpText<"Use instrumentation data for profile-guided optimization. If pathname is a directory, it reads from <pathname>/default.profdata. Otherwise, it reads from file <pathname>.">;
def fno_profile_instr_generate : Flag<["-"], "fno-profile-instr-generate">,
Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Disable generation of profile instrumentation.">;
def fno_profile_generate : Flag<["-"], "fno-profile-generate">,
Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Disable generation of profile instrumentation.">;
def fno_profile_instr_use : Flag<["-"], "fno-profile-instr-use">,
Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Disable using instrumentation data for profile-guided optimization">;
def fno_profile_use : Flag<["-"], "fno-profile-use">,
Alias<fno_profile_instr_use>;
def fprofile_filter_files_EQ : Joined<["-"], "fprofile-filter-files=">,
Group<f_Group>, Flags<[CC1Option, CoreOption]>,
HelpText<"Instrument only functions from files where names match any regex separated by a semi-colon">;
def fprofile_exclude_files_EQ : Joined<["-"], "fprofile-exclude-files=">,
Group<f_Group>, Flags<[CC1Option, CoreOption]>,
HelpText<"Instrument only functions from files where names don't match all the regexes separated by a semi-colon">;
def forder_file_instrumentation : Flag<["-"], "forder-file-instrumentation">,
Group<f_Group>, Flags<[CC1Option, CoreOption]>,
HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">;
def faddrsig : Flag<["-"], "faddrsig">, Group<f_Group>, Flags<[CoreOption, CC1Option]>,
HelpText<"Emit an address-significance table">;
def fno_addrsig : Flag<["-"], "fno-addrsig">, Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Don't emit an address-significance table">;
def fblocks : Flag<["-"], "fblocks">, Group<f_Group>, Flags<[CoreOption, CC1Option]>,
HelpText<"Enable the 'blocks' language feature">;
def fbootclasspath_EQ : Joined<["-"], "fbootclasspath=">, Group<f_Group>;
def fborland_extensions : Flag<["-"], "fborland-extensions">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Accept non-standard constructs supported by the Borland compiler">;
def fbuiltin : Flag<["-"], "fbuiltin">, Group<f_Group>, Flags<[CoreOption]>;
def fbuiltin_module_map : Flag <["-"], "fbuiltin-module-map">, Group<f_Group>,
Flags<[DriverOption]>, HelpText<"Load the clang builtins module map file.">;
def fcaret_diagnostics : Flag<["-"], "fcaret-diagnostics">, Group<f_Group>;
def fclang_abi_compat_EQ : Joined<["-"], "fclang-abi-compat=">, Group<f_clang_Group>,
Flags<[CC1Option]>, MetaVarName<"<version>">, Values<"<major>.<minor>,latest">,
HelpText<"Attempt to match the ABI of Clang <version>">;
def fclasspath_EQ : Joined<["-"], "fclasspath=">, Group<f_Group>;
def fcolor_diagnostics : Flag<["-"], "fcolor-diagnostics">, Group<f_Group>,
Flags<[CoreOption, CC1Option]>, HelpText<"Use colors in diagnostics">;
def fdiagnostics_color : Flag<["-"], "fdiagnostics-color">, Group<f_Group>,
Flags<[CoreOption, DriverOption]>;
def fdiagnostics_color_EQ : Joined<["-"], "fdiagnostics-color=">, Group<f_Group>;
def fansi_escape_codes : Flag<["-"], "fansi-escape-codes">, Group<f_Group>,
Flags<[CoreOption, CC1Option]>, HelpText<"Use ANSI escape codes for diagnostics">;
def fcomment_block_commands : CommaJoined<["-"], "fcomment-block-commands=">, Group<f_clang_Group>, Flags<[CC1Option]>,
HelpText<"Treat each comma separated argument in <arg> as a documentation comment block command">,
MetaVarName<"<arg>">;
def fparse_all_comments : Flag<["-"], "fparse-all-comments">, Group<f_clang_Group>, Flags<[CC1Option]>;
def frecord_command_line : Flag<["-"], "frecord-command-line">,
Group<f_clang_Group>;
def fno_record_command_line : Flag<["-"], "fno-record-command-line">,
Group<f_clang_Group>;
def : Flag<["-"], "frecord-gcc-switches">, Alias<frecord_command_line>;
def : Flag<["-"], "fno-record-gcc-switches">, Alias<fno_record_command_line>;
def fcommon : Flag<["-"], "fcommon">, Group<f_Group>;
def fcompile_resource_EQ : Joined<["-"], "fcompile-resource=">, Group<f_Group>;
def fcomplete_member_pointers : Flag<["-"], "fcomplete-member-pointers">, Group<f_clang_Group>,
Flags<[CoreOption, CC1Option]>,
HelpText<"Require member pointer base types to be complete if they would be significant under the Microsoft ABI">;
def fno_complete_member_pointers : Flag<["-"], "fno-complete-member-pointers">, Group<f_clang_Group>,
Flags<[CoreOption]>,
HelpText<"Do not require member pointer base types to be complete if they would be significant under the Microsoft ABI">;
def fcf_runtime_abi_EQ : Joined<["-"], "fcf-runtime-abi=">, Group<f_Group>,
Flags<[CC1Option]>;
def fconstant_cfstrings : Flag<["-"], "fconstant-cfstrings">, Group<f_Group>;
def fconstant_string_class_EQ : Joined<["-"], "fconstant-string-class=">, Group<f_Group>;
def fconstexpr_depth_EQ : Joined<["-"], "fconstexpr-depth=">, Group<f_Group>;
def fconstexpr_steps_EQ : Joined<["-"], "fconstexpr-steps=">, Group<f_Group>;
def fexperimental_new_constant_interpreter : Flag<["-"], "fexperimental-new-constant-interpreter">, Group<f_Group>,
HelpText<"Enable the experimental new constant interpreter">, Flags<[CC1Option]>;
def fconstexpr_backtrace_limit_EQ : Joined<["-"], "fconstexpr-backtrace-limit=">,
Group<f_Group>;
def fno_crash_diagnostics : Flag<["-"], "fno-crash-diagnostics">, Group<f_clang_Group>, Flags<[NoArgumentUnused, CoreOption]>,
HelpText<"Disable auto-generation of preprocessed source files and a script for reproduction during a clang crash">;
def fcrash_diagnostics_dir : Joined<["-"], "fcrash-diagnostics-dir=">, Group<f_clang_Group>, Flags<[NoArgumentUnused, CoreOption]>;
def fcreate_profile : Flag<["-"], "fcreate-profile">, Group<f_Group>;
def fcxx_exceptions: Flag<["-"], "fcxx-exceptions">, Group<f_Group>,
HelpText<"Enable C++ exceptions">, Flags<[CC1Option]>;
def fcxx_modules : Flag <["-"], "fcxx-modules">, Group<f_Group>,
Flags<[DriverOption]>;
def fdebug_pass_arguments : Flag<["-"], "fdebug-pass-arguments">, Group<f_Group>;
def fdebug_pass_structure : Flag<["-"], "fdebug-pass-structure">, Group<f_Group>;
def fdepfile_entry : Joined<["-"], "fdepfile-entry=">,
Group<f_clang_Group>, Flags<[CC1Option]>;
def fdiagnostics_fixit_info : Flag<["-"], "fdiagnostics-fixit-info">, Group<f_clang_Group>;
def fdiagnostics_parseable_fixits : Flag<["-"], "fdiagnostics-parseable-fixits">, Group<f_clang_Group>,
Flags<[CoreOption, CC1Option]>, HelpText<"Print fix-its in machine parseable form">;
def fdiagnostics_print_source_range_info : Flag<["-"], "fdiagnostics-print-source-range-info">,
Group<f_clang_Group>, Flags<[CC1Option]>,
HelpText<"Print source range spans in numeric form">;
def fdiagnostics_show_hotness : Flag<["-"], "fdiagnostics-show-hotness">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Enable profile hotness information in diagnostic line">;
def fdiagnostics_hotness_threshold_EQ : Joined<["-"], "fdiagnostics-hotness-threshold=">,
Group<f_Group>, Flags<[CC1Option]>, MetaVarName<"<number>">,
HelpText<"Prevent optimization remarks from being output if they do not have at least this profile count">;
def fdiagnostics_show_option : Flag<["-"], "fdiagnostics-show-option">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Print option name with mappable diagnostics">;
def fdiagnostics_show_note_include_stack : Flag<["-"], "fdiagnostics-show-note-include-stack">,
Group<f_Group>, Flags<[CC1Option]>, HelpText<"Display include stacks for diagnostic notes">;
def fdiagnostics_format_EQ : Joined<["-"], "fdiagnostics-format=">, Group<f_clang_Group>;
def fdiagnostics_show_category_EQ : Joined<["-"], "fdiagnostics-show-category=">, Group<f_clang_Group>;
def fdiagnostics_show_template_tree : Flag<["-"], "fdiagnostics-show-template-tree">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Print a template comparison tree for differing templates">;
def fdeclspec : Flag<["-"], "fdeclspec">, Group<f_clang_Group>,
HelpText<"Allow __declspec as a keyword">, Flags<[CC1Option]>;
def fdiscard_value_names : Flag<["-"], "fdiscard-value-names">, Group<f_clang_Group>,
HelpText<"Discard value names in LLVM IR">, Flags<[DriverOption]>;
def fno_discard_value_names : Flag<["-"], "fno-discard-value-names">, Group<f_clang_Group>,
HelpText<"Do not discard value names in LLVM IR">, Flags<[DriverOption]>;
def fdollars_in_identifiers : Flag<["-"], "fdollars-in-identifiers">, Group<f_Group>,
HelpText<"Allow '$' in identifiers">, Flags<[CC1Option]>;
def fdwarf2_cfi_asm : Flag<["-"], "fdwarf2-cfi-asm">, Group<clang_ignored_f_Group>;
def fno_dwarf2_cfi_asm : Flag<["-"], "fno-dwarf2-cfi-asm">, Group<clang_ignored_f_Group>;
def fdwarf_directory_asm : Flag<["-"], "fdwarf-directory-asm">, Group<f_Group>;
def fno_dwarf_directory_asm : Flag<["-"], "fno-dwarf-directory-asm">, Group<f_Group>, Flags<[CC1Option]>;
def felide_constructors : Flag<["-"], "felide-constructors">, Group<f_Group>;
def fno_elide_type : Flag<["-"], "fno-elide-type">, Group<f_Group>,
Flags<[CC1Option]>,
HelpText<"Do not elide types when printing diagnostics">;
def feliminate_unused_debug_symbols : Flag<["-"], "feliminate-unused-debug-symbols">, Group<f_Group>;
def femit_all_decls : Flag<["-"], "femit-all-decls">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Emit all declarations, even if unused">;
def femulated_tls : Flag<["-"], "femulated-tls">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Use emutls functions to access thread_local variables">;
def fno_emulated_tls : Flag<["-"], "fno-emulated-tls">, Group<f_Group>, Flags<[CC1Option]>;
def fencoding_EQ : Joined<["-"], "fencoding=">, Group<f_Group>;
def ferror_limit_EQ : Joined<["-"], "ferror-limit=">, Group<f_Group>, Flags<[CoreOption]>;
def fexceptions : Flag<["-"], "fexceptions">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Enable support for exception handling">;
def fdwarf_exceptions : Flag<["-"], "fdwarf-exceptions">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Use DWARF style exceptions">;
def fsjlj_exceptions : Flag<["-"], "fsjlj-exceptions">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Use SjLj style exceptions">;
def fseh_exceptions : Flag<["-"], "fseh-exceptions">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Use SEH style exceptions">;
def fwasm_exceptions : Flag<["-"], "fwasm-exceptions">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Use WebAssembly style exceptions">;
def fexcess_precision_EQ : Joined<["-"], "fexcess-precision=">,
Group<clang_ignored_gcc_optimization_f_Group>;
def : Flag<["-"], "fexpensive-optimizations">, Group<clang_ignored_gcc_optimization_f_Group>;
def : Flag<["-"], "fno-expensive-optimizations">, Group<clang_ignored_gcc_optimization_f_Group>;
def fextdirs_EQ : Joined<["-"], "fextdirs=">, Group<f_Group>;
def : Flag<["-"], "fdefer-pop">, Group<clang_ignored_gcc_optimization_f_Group>;
def : Flag<["-"], "fno-defer-pop">, Group<clang_ignored_gcc_optimization_f_Group>;
def : Flag<["-"], "fextended-identifiers">, Group<clang_ignored_f_Group>;
def : Flag<["-"], "fno-extended-identifiers">, Group<f_Group>, Flags<[Unsupported]>;
def fhosted : Flag<["-"], "fhosted">, Group<f_Group>;
def fdenormal_fp_math_EQ : Joined<["-"], "fdenormal-fp-math=">, Group<f_Group>, Flags<[CC1Option]>;
def ffp_model_EQ : Joined<["-"], "ffp-model=">, Group<f_Group>, Flags<[DriverOption]>,
HelpText<"Controls the semantics of floating-point calculations.">;
def ffp_exception_behavior_EQ : Joined<["-"], "ffp-exception-behavior=">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Specifies the exception behavior of floating-point operations.">;
def ffast_math : Flag<["-"], "ffast-math">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Allow aggressive, lossy floating-point optimizations">;
def fno_fast_math : Flag<["-"], "fno-fast-math">, Group<f_Group>;
def fmath_errno : Flag<["-"], "fmath-errno">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Require math functions to indicate errors by setting errno">;
def fno_math_errno : Flag<["-"], "fno-math-errno">, Group<f_Group>;
def fbracket_depth_EQ : Joined<["-"], "fbracket-depth=">, Group<f_Group>, Flags<[CoreOption]>;
def fsignaling_math : Flag<["-"], "fsignaling-math">, Group<f_Group>;
def fno_signaling_math : Flag<["-"], "fno-signaling-math">, Group<f_Group>;
def fjump_tables : Flag<["-"], "fjump-tables">, Group<f_Group>;
def fno_jump_tables : Flag<["-"], "fno-jump-tables">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Do not use jump tables for lowering switches">;
def fforce_enable_int128 : Flag<["-"], "fforce-enable-int128">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Enable support for int128_t type">;
def fno_force_enable_int128 : Flag<["-"], "fno-force-enable-int128">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Disable support for int128_t type">;
def fkeep_static_consts : Flag<["-"], "fkeep-static-consts">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Keep static const variables even if unused">;
def ffixed_point : Flag<["-"], "ffixed-point">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Enable fixed point types">;
def fno_fixed_point : Flag<["-"], "fno-fixed-point">, Group<f_Group>,
HelpText<"Disable fixed point types">;
def fcxx_static_destructors : Flag<["-"], "fc++-static-destructors">,
Group<f_Group>,
HelpText<"Enable C++ static destructor registration (the default)">;
def fno_cxx_static_destructors : Flag<["-"], "fno-c++-static-destructors">,
Group<f_Group>,
Flags<[CC1Option]>,
HelpText<"Disable C++ static destructor registration">;
def fsymbol_partition_EQ : Joined<["-"], "fsymbol-partition=">, Group<f_Group>,
Flags<[CC1Option]>;
// Begin sanitizer flags. These should all be core options exposed in all driver
// modes.
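// TableGen's block-scoped `let` applies Flags<[CC1Option, CoreOption]> to
// every def in the braces below, so the individual sanitizer options need not
// repeat it. Illustrative sketch only (not an option Clang defines): inside
// the block,
//   def fsanitize_example : Flag<["-"], "fsanitize-example">, Group<f_clang_Group>;
// ends up with the same flags as if it had spelled out
// Flags<[CC1Option, CoreOption]> itself.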
let Flags = [CC1Option, CoreOption] in {
def fsanitize_EQ : CommaJoined<["-"], "fsanitize=">, Group<f_clang_Group>,
MetaVarName<"<check>">,
HelpText<"Turn on runtime checks for various forms of undefined "
"or suspicious behavior. See user manual for available checks">;
def fno_sanitize_EQ : CommaJoined<["-"], "fno-sanitize=">, Group<f_clang_Group>,
Flags<[CoreOption, DriverOption]>;
def fsanitize_blacklist : Joined<["-"], "fsanitize-blacklist=">,
Group<f_clang_Group>,
HelpText<"Path to blacklist file for sanitizers">;
def fsanitize_system_blacklist : Joined<["-"], "fsanitize-system-blacklist=">,
HelpText<"Path to system blacklist file for sanitizers">,
Flags<[CC1Option]>;
def fno_sanitize_blacklist : Flag<["-"], "fno-sanitize-blacklist">,
Group<f_clang_Group>,
HelpText<"Don't use blacklist file for sanitizers">;
def fsanitize_coverage
: CommaJoined<["-"], "fsanitize-coverage=">,
Group<f_clang_Group>,
HelpText<"Specify the type of coverage instrumentation for Sanitizers">;
def fno_sanitize_coverage
: CommaJoined<["-"], "fno-sanitize-coverage=">,
Group<f_clang_Group>, Flags<[CoreOption, DriverOption]>,
HelpText<"Disable specified features of coverage instrumentation for "
"Sanitizers">, Values<"func,bb,edge,indirect-calls,trace-bb,trace-cmp,trace-div,trace-gep,8bit-counters,trace-pc,trace-pc-guard,no-prune,inline-8bit-counters">;
def fsanitize_memory_track_origins_EQ : Joined<["-"], "fsanitize-memory-track-origins=">,
Group<f_clang_Group>,
HelpText<"Enable origins tracking in MemorySanitizer">;
def fsanitize_memory_track_origins : Flag<["-"], "fsanitize-memory-track-origins">,
Group<f_clang_Group>,
HelpText<"Enable origins tracking in MemorySanitizer">;
def fno_sanitize_memory_track_origins : Flag<["-"], "fno-sanitize-memory-track-origins">,
Group<f_clang_Group>,
Flags<[CoreOption, DriverOption]>,
HelpText<"Disable origins tracking in MemorySanitizer">;
def fsanitize_memory_use_after_dtor : Flag<["-"], "fsanitize-memory-use-after-dtor">,
Group<f_clang_Group>,
HelpText<"Enable use-after-destroy detection in MemorySanitizer">;
def fno_sanitize_memory_use_after_dtor : Flag<["-"], "fno-sanitize-memory-use-after-dtor">,
Group<f_clang_Group>,
HelpText<"Disable use-after-destroy detection in MemorySanitizer">;
def fsanitize_address_field_padding : Joined<["-"], "fsanitize-address-field-padding=">,
Group<f_clang_Group>,
HelpText<"Level of field padding for AddressSanitizer">;
def fsanitize_address_use_after_scope : Flag<["-"], "fsanitize-address-use-after-scope">,
Group<f_clang_Group>,
HelpText<"Enable use-after-scope detection in AddressSanitizer">;
def fno_sanitize_address_use_after_scope : Flag<["-"], "fno-sanitize-address-use-after-scope">,
Group<f_clang_Group>,
Flags<[CoreOption, DriverOption]>,
HelpText<"Disable use-after-scope detection in AddressSanitizer">;
def fsanitize_address_poison_custom_array_cookie
: Flag<[ "-" ], "fsanitize-address-poison-custom-array-cookie">,
Group<f_clang_Group>,
HelpText<"Enable poisoning array cookies when using custom operator new[] in AddressSanitizer">;
def fno_sanitize_address_poison_custom_array_cookie
: Flag<[ "-" ], "fno-sanitize-address-poison-custom-array-cookie">,
Group<f_clang_Group>,
HelpText<"Disable poisoning array cookies when using custom operator new[] in AddressSanitizer">;
def fsanitize_address_globals_dead_stripping : Flag<["-"], "fsanitize-address-globals-dead-stripping">,
Group<f_clang_Group>,
HelpText<"Enable linker dead stripping of globals in AddressSanitizer">;
def fsanitize_address_use_odr_indicator
: Flag<["-"], "fsanitize-address-use-odr-indicator">,
Group<f_clang_Group>,
HelpText<"Enable ODR indicator globals to avoid false ODR violation reports in partially sanitized programs at the cost of an increase in binary size">;
def fno_sanitize_address_use_odr_indicator
: Flag<["-"], "fno-sanitize-address-use-odr-indicator">,
Group<f_clang_Group>,
HelpText<"Disable ODR indicator globals">;
// Note: This flag was introduced when it was necessary to distinguish between
// ABIs for correct codegen. This is no longer needed, but the flag is
// not removed since targeting either ABI will behave the same.
// This way we cause no disturbance to existing scripts & code, and if we
// want to use this flag in the future we will cause no disturbance then
// either.
def fsanitize_hwaddress_abi_EQ
: Joined<["-"], "fsanitize-hwaddress-abi=">,
Group<f_clang_Group>,
HelpText<"Select the HWAddressSanitizer ABI to target (interceptor or platform, default interceptor). This option is currently unused.">;
def fsanitize_recover : Flag<["-"], "fsanitize-recover">, Group<f_clang_Group>;
def fno_sanitize_recover : Flag<["-"], "fno-sanitize-recover">,
Flags<[CoreOption, DriverOption]>,
Group<f_clang_Group>;
def fsanitize_recover_EQ : CommaJoined<["-"], "fsanitize-recover=">,
Group<f_clang_Group>,
HelpText<"Enable recovery for specified sanitizers">;
def fno_sanitize_recover_EQ
: CommaJoined<["-"], "fno-sanitize-recover=">,
Group<f_clang_Group>,
Flags<[CoreOption, DriverOption]>,
HelpText<"Disable recovery for specified sanitizers">;
def fsanitize_trap_EQ : CommaJoined<["-"], "fsanitize-trap=">, Group<f_clang_Group>,
HelpText<"Enable trapping for specified sanitizers">;
def fno_sanitize_trap_EQ : CommaJoined<["-"], "fno-sanitize-trap=">, Group<f_clang_Group>,
Flags<[CoreOption, DriverOption]>,
HelpText<"Disable trapping for specified sanitizers">;
def fsanitize_undefined_trap_on_error : Flag<["-"], "fsanitize-undefined-trap-on-error">,
Group<f_clang_Group>;
def fno_sanitize_undefined_trap_on_error : Flag<["-"], "fno-sanitize-undefined-trap-on-error">,
Group<f_clang_Group>;
def fsanitize_minimal_runtime : Flag<["-"], "fsanitize-minimal-runtime">,
Group<f_clang_Group>;
def fno_sanitize_minimal_runtime : Flag<["-"], "fno-sanitize-minimal-runtime">,
Group<f_clang_Group>;
def fsanitize_link_runtime : Flag<["-"], "fsanitize-link-runtime">,
Group<f_clang_Group>;
def fno_sanitize_link_runtime : Flag<["-"], "fno-sanitize-link-runtime">,
Group<f_clang_Group>;
def fsanitize_link_cxx_runtime : Flag<["-"], "fsanitize-link-c++-runtime">,
Group<f_clang_Group>;
def fno_sanitize_link_cxx_runtime : Flag<["-"], "fno-sanitize-link-c++-runtime">,
Group<f_clang_Group>;
def fsanitize_cfi_cross_dso : Flag<["-"], "fsanitize-cfi-cross-dso">,
Group<f_clang_Group>,
HelpText<"Enable control flow integrity (CFI) checks for cross-DSO calls.">;
def fno_sanitize_cfi_cross_dso : Flag<["-"], "fno-sanitize-cfi-cross-dso">,
Flags<[CoreOption, DriverOption]>,
Group<f_clang_Group>,
HelpText<"Disable control flow integrity (CFI) checks for cross-DSO calls.">;
def fsanitize_cfi_icall_generalize_pointers : Flag<["-"], "fsanitize-cfi-icall-generalize-pointers">,
Group<f_clang_Group>,
HelpText<"Generalize pointers in CFI indirect call type signature checks">;
def fsanitize_cfi_canonical_jump_tables : Flag<["-"], "fsanitize-cfi-canonical-jump-tables">,
Group<f_clang_Group>,
HelpText<"Make the jump table addresses canonical in the symbol table">;
def fno_sanitize_cfi_canonical_jump_tables : Flag<["-"], "fno-sanitize-cfi-canonical-jump-tables">,
Group<f_clang_Group>,
Flags<[CoreOption, DriverOption]>,
HelpText<"Do not make the jump table addresses canonical in the symbol table">;
def fsanitize_stats : Flag<["-"], "fsanitize-stats">,
Group<f_clang_Group>,
HelpText<"Enable sanitizer statistics gathering.">;
def fno_sanitize_stats : Flag<["-"], "fno-sanitize-stats">,
Group<f_clang_Group>,
Flags<[CoreOption, DriverOption]>,
HelpText<"Disable sanitizer statistics gathering.">;
def fsanitize_thread_memory_access : Flag<["-"], "fsanitize-thread-memory-access">,
Group<f_clang_Group>,
HelpText<"Enable memory access instrumentation in ThreadSanitizer (default)">;
def fno_sanitize_thread_memory_access : Flag<["-"], "fno-sanitize-thread-memory-access">,
Group<f_clang_Group>,
Flags<[CoreOption, DriverOption]>,
HelpText<"Disable memory access instrumentation in ThreadSanitizer">;
def fsanitize_thread_func_entry_exit : Flag<["-"], "fsanitize-thread-func-entry-exit">,
Group<f_clang_Group>,
HelpText<"Enable function entry/exit instrumentation in ThreadSanitizer (default)">;
def fno_sanitize_thread_func_entry_exit : Flag<["-"], "fno-sanitize-thread-func-entry-exit">,
Group<f_clang_Group>,
Flags<[CoreOption, DriverOption]>,
HelpText<"Disable function entry/exit instrumentation in ThreadSanitizer">;
def fsanitize_thread_atomics : Flag<["-"], "fsanitize-thread-atomics">,
Group<f_clang_Group>,
HelpText<"Enable atomic operations instrumentation in ThreadSanitizer (default)">;
def fno_sanitize_thread_atomics : Flag<["-"], "fno-sanitize-thread-atomics">,
Group<f_clang_Group>,
Flags<[CoreOption, DriverOption]>,
HelpText<"Disable atomic operations instrumentation in ThreadSanitizer">;
def fsanitize_undefined_strip_path_components_EQ : Joined<["-"], "fsanitize-undefined-strip-path-components=">,
Group<f_clang_Group>, MetaVarName<"<number>">,
HelpText<"Strip (or keep only, if negative) a given number of path components "
"when emitting check metadata.">;
} // end -f[no-]sanitize* flags
def funsafe_math_optimizations : Flag<["-"], "funsafe-math-optimizations">,
Group<f_Group>;
def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">,
Group<f_Group>;
def fassociative_math : Flag<["-"], "fassociative-math">, Group<f_Group>;
def fno_associative_math : Flag<["-"], "fno-associative-math">, Group<f_Group>;
def freciprocal_math :
Flag<["-"], "freciprocal-math">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Allow division operations to be reassociated">;
def fno_reciprocal_math : Flag<["-"], "fno-reciprocal-math">, Group<f_Group>;
def ffinite_math_only : Flag<["-"], "ffinite-math-only">, Group<f_Group>, Flags<[CC1Option]>;
def fno_finite_math_only : Flag<["-"], "fno-finite-math-only">, Group<f_Group>;
def fsigned_zeros : Flag<["-"], "fsigned-zeros">, Group<f_Group>;
def fno_signed_zeros :
Flag<["-"], "fno-signed-zeros">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Allow optimizations that ignore the sign of floating point zeros">;
def fhonor_nans : Flag<["-"], "fhonor-nans">, Group<f_Group>;
def fno_honor_nans : Flag<["-"], "fno-honor-nans">, Group<f_Group>;
def fhonor_infinities : Flag<["-"], "fhonor-infinities">, Group<f_Group>;
def fno_honor_infinities : Flag<["-"], "fno-honor-infinities">, Group<f_Group>;
// This option was originally misspelt "infinites" [sic].
def : Flag<["-"], "fhonor-infinites">, Alias<fhonor_infinities>;
def : Flag<["-"], "fno-honor-infinites">, Alias<fno_honor_infinities>;
def frounding_math : Flag<["-"], "frounding-math">, Group<f_Group>, Flags<[CC1Option]>;
def fno_rounding_math : Flag<["-"], "fno-rounding-math">, Group<f_Group>, Flags<[CC1Option]>;
def ftrapping_math : Flag<["-"], "ftrapping-math">, Group<f_Group>, Flags<[CC1Option]>;
def fno_trapping_math : Flag<["-"], "fno-trapping-math">, Group<f_Group>, Flags<[CC1Option]>;
def ffp_contract : Joined<["-"], "ffp-contract=">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)"
" | on (according to FP_CONTRACT pragma) | off (never fuse). Default"
" is 'fast' for CUDA/HIP and 'on' otherwise.">, Values<"fast,on,off">;
def fstrict_float_cast_overflow : Flag<["-"],
"fstrict-float-cast-overflow">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Assume that overflowing float-to-int casts are undefined (default)">;
def fno_strict_float_cast_overflow : Flag<["-"],
"fno-strict-float-cast-overflow">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Relax language rules and try to match the behavior of the target's native float-to-int conversion instructions">;
def ffor_scope : Flag<["-"], "ffor-scope">, Group<f_Group>;
def fno_for_scope : Flag<["-"], "fno-for-scope">, Group<f_Group>;
def frewrite_includes : Flag<["-"], "frewrite-includes">, Group<f_Group>,
Flags<[CC1Option]>;
def fno_rewrite_includes : Flag<["-"], "fno-rewrite-includes">, Group<f_Group>;
def frewrite_imports : Flag<["-"], "frewrite-imports">, Group<f_Group>,
Flags<[CC1Option]>;
def fno_rewrite_imports : Flag<["-"], "fno-rewrite-imports">, Group<f_Group>;
def fdelete_null_pointer_checks : Flag<["-"],
"fdelete-null-pointer-checks">, Group<f_Group>,
HelpText<"Treat usage of null pointers as undefined behavior.">;
def fno_delete_null_pointer_checks : Flag<["-"],
"fno-delete-null-pointer-checks">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Do not treat usage of null pointers as undefined behavior.">;
def frewrite_map_file : Separate<["-"], "frewrite-map-file">,
Group<f_Group>,
  Flags<[DriverOption, CC1Option]>;
def frewrite_map_file_EQ : Joined<["-"], "frewrite-map-file=">,
Group<f_Group>,
Flags<[DriverOption]>;
def fuse_line_directives : Flag<["-"], "fuse-line-directives">, Group<f_Group>,
Flags<[CC1Option]>;
def fno_use_line_directives : Flag<["-"], "fno-use-line-directives">, Group<f_Group>;
def ffreestanding : Flag<["-"], "ffreestanding">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Assert that the compilation takes place in a freestanding environment">;
def fgnuc_version_EQ : Joined<["-"], "fgnuc-version=">, Group<f_Group>,
HelpText<"Sets various macros to claim compatibility with the given GCC version (default is 4.2.1)">,
Flags<[CC1Option, CoreOption]>;
def fgnu_keywords : Flag<["-"], "fgnu-keywords">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Allow GNU-extension keywords regardless of language standard">;
def fgnu89_inline : Flag<["-"], "fgnu89-inline">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Use the gnu89 inline semantics">;
def fno_gnu89_inline : Flag<["-"], "fno-gnu89-inline">, Group<f_Group>;
def fgnu_runtime : Flag<["-"], "fgnu-runtime">, Group<f_Group>,
HelpText<"Generate output compatible with the standard GNU Objective-C runtime">;
def fheinous_gnu_extensions : Flag<["-"], "fheinous-gnu-extensions">, Flags<[CC1Option]>;
def filelist : Separate<["-"], "filelist">, Flags<[LinkerInput]>,
Group<Link_Group>;
def : Flag<["-"], "findirect-virtual-calls">, Alias<fapple_kext>;
def finline_functions : Flag<["-"], "finline-functions">, Group<f_clang_Group>, Flags<[CC1Option]>,
HelpText<"Inline suitable functions">;
def finline_hint_functions: Flag<["-"], "finline-hint-functions">, Group<f_clang_Group>, Flags<[CC1Option]>,
HelpText<"Inline functions which are (explicitly or implicitly) marked inline">;
def finline : Flag<["-"], "finline">, Group<clang_ignored_f_Group>;
def fexperimental_isel : Flag<["-"], "fexperimental-isel">, Group<f_clang_Group>,
HelpText<"Enables the experimental global instruction selector">;
def fexperimental_new_pass_manager : Flag<["-"], "fexperimental-new-pass-manager">,
Group<f_clang_Group>, Flags<[CC1Option]>,
HelpText<"Enables an experimental new pass manager in LLVM.">;
def finput_charset_EQ : Joined<["-"], "finput-charset=">, Group<f_Group>;
def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group<f_Group>;
def finstrument_functions : Flag<["-"], "finstrument-functions">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Generate calls to instrument function entry and exit">;
def finstrument_functions_after_inlining : Flag<["-"], "finstrument-functions-after-inlining">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Like -finstrument-functions, but insert the calls after inlining">;
def finstrument_function_entry_bare : Flag<["-"], "finstrument-function-entry-bare">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Instrument function entry only, after inlining, without arguments to the instrumentation call">;
def fcf_protection_EQ : Joined<["-"], "fcf-protection=">, Flags<[CoreOption, CC1Option]>, Group<f_Group>,
HelpText<"Instrument control-flow architecture protection. Options: return, branch, full, none.">, Values<"return,branch,full,none">;
def fcf_protection : Flag<["-"], "fcf-protection">, Group<f_Group>, Flags<[CoreOption, CC1Option]>,
Alias<fcf_protection_EQ>, AliasArgs<["full"]>,
HelpText<"Enable cf-protection in 'full' mode">;
def fxray_instrument : Flag<["-"], "fxray-instrument">, Group<f_Group>,
Flags<[CC1Option]>,
HelpText<"Generate XRay instrumentation sleds on function entry and exit">;
def fnoxray_instrument : Flag<["-"], "fno-xray-instrument">, Group<f_Group>,
Flags<[CC1Option]>;
def fxray_instruction_threshold_EQ :
JoinedOrSeparate<["-"], "fxray-instruction-threshold=">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Sets the minimum function size to instrument with XRay">;
def fxray_instruction_threshold_ :
JoinedOrSeparate<["-"], "fxray-instruction-threshold">,
Group<f_Group>, Flags<[CC1Option]>;
def fxray_always_instrument :
JoinedOrSeparate<["-"], "fxray-always-instrument=">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"DEPRECATED: Filename defining the whitelist for imbuing the 'always instrument' XRay attribute.">;
def fxray_never_instrument :
JoinedOrSeparate<["-"], "fxray-never-instrument=">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"DEPRECATED: Filename defining the whitelist for imbuing the 'never instrument' XRay attribute.">;
def fxray_attr_list :
JoinedOrSeparate<["-"], "fxray-attr-list=">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Filename defining the list of functions/types for imbuing XRay attributes.">;
def fxray_modes :
JoinedOrSeparate<["-"], "fxray-modes=">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"List of modes to link in by default into XRay instrumented binaries.">;
def fxray_always_emit_customevents : Flag<["-"], "fxray-always-emit-customevents">, Group<f_Group>,
Flags<[CC1Option]>,
HelpText<"Determine whether to always emit __xray_customevent(...) calls even if the function it appears in is not always instrumented.">;
def fnoxray_always_emit_customevents : Flag<["-"], "fno-xray-always-emit-customevents">, Group<f_Group>,
Flags<[CC1Option]>;
def fxray_always_emit_typedevents : Flag<["-"], "fxray-always-emit-typedevents">, Group<f_Group>,
Flags<[CC1Option]>,
HelpText<"Determine whether to always emit __xray_typedevent(...) calls even if the function it appears in is not always instrumented.">;
def fnoxray_always_emit_typedevents : Flag<["-"], "fno-xray-always-emit-typedevents">, Group<f_Group>,
Flags<[CC1Option]>;
def fxray_link_deps : Flag<["-"], "fxray-link-deps">, Group<f_Group>,
Flags<[CC1Option]>,
HelpText<"Tells clang to add the link dependencies for XRay.">;
def fnoxray_link_deps : Flag<["-"], "fnoxray-link-deps">, Group<f_Group>,
Flags<[CC1Option]>;
def fxray_instrumentation_bundle :
JoinedOrSeparate<["-"], "fxray-instrumentation-bundle=">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Select which XRay instrumentation points to emit. Options: all, none, function, custom. Default is 'all'.">;
def ffine_grained_bitfield_accesses : Flag<["-"],
"ffine-grained-bitfield-accesses">, Group<f_clang_Group>, Flags<[CC1Option]>,
HelpText<"Use separate accesses for consecutive bitfield runs with legal widths and alignments.">;
def fno_fine_grained_bitfield_accesses : Flag<["-"],
"fno-fine-grained-bitfield-accesses">, Group<f_clang_Group>, Flags<[CC1Option]>,
HelpText<"Use large-integer access for consecutive bitfield runs.">;
def flat__namespace : Flag<["-"], "flat_namespace">;
def flax_vector_conversions_EQ : Joined<["-"], "flax-vector-conversions=">, Group<f_Group>,
HelpText<"Enable implicit vector bit-casts">, Values<"none,integer,all">, Flags<[CC1Option]>;
def flax_vector_conversions : Flag<["-"], "flax-vector-conversions">, Group<f_Group>,
Alias<flax_vector_conversions_EQ>, AliasArgs<["integer"]>;
def flimited_precision_EQ : Joined<["-"], "flimited-precision=">, Group<f_Group>;
def fapple_link_rtlib : Flag<["-"], "fapple-link-rtlib">, Group<f_Group>,
HelpText<"Force linking the clang builtins runtime library">;
def flto_EQ : Joined<["-"], "flto=">, Flags<[CoreOption, CC1Option]>, Group<f_Group>,
HelpText<"Set LTO mode to either 'full' or 'thin'">, Values<"thin,full">;
def flto : Flag<["-"], "flto">, Flags<[CoreOption, CC1Option]>, Group<f_Group>,
HelpText<"Enable LTO in 'full' mode">;
def fno_lto : Flag<["-"], "fno-lto">, Group<f_Group>,
HelpText<"Disable LTO mode (default)">;
def flto_jobs_EQ : Joined<["-"], "flto-jobs=">,
Flags<[CC1Option]>, Group<f_Group>,
HelpText<"Controls the backend parallelism of -flto=thin (default "
"of 0 means the number of threads will be derived from "
"the number of CPUs detected)">;
def fthinlto_index_EQ : Joined<["-"], "fthinlto-index=">,
Flags<[CoreOption, CC1Option]>, Group<f_Group>,
HelpText<"Perform ThinLTO importing using provided function summary index">;
def fthin_link_bitcode_EQ : Joined<["-"], "fthin-link-bitcode=">,
Flags<[CoreOption, CC1Option]>, Group<f_Group>,
HelpText<"Write minimized bitcode to <file> for the ThinLTO thin link only">;
def fmacro_backtrace_limit_EQ : Joined<["-"], "fmacro-backtrace-limit=">,
Group<f_Group>, Flags<[DriverOption, CoreOption]>;
def fmerge_all_constants : Flag<["-"], "fmerge-all-constants">, Group<f_Group>,
Flags<[CC1Option, CoreOption]>, HelpText<"Allow merging of constants">;
def fmessage_length_EQ : Joined<["-"], "fmessage-length=">, Group<f_Group>;
def fms_extensions : Flag<["-"], "fms-extensions">, Group<f_Group>, Flags<[CC1Option, CoreOption]>,
HelpText<"Accept some non-standard constructs supported by the Microsoft compiler">;
def fms_compatibility : Flag<["-"], "fms-compatibility">, Group<f_Group>, Flags<[CC1Option, CoreOption]>,
HelpText<"Enable full Microsoft Visual C++ compatibility">;
def fms_volatile : Flag<["-"], "fms-volatile">, Group<f_Group>, Flags<[CC1Option]>;
def fmsc_version : Joined<["-"], "fmsc-version=">, Group<f_Group>, Flags<[DriverOption, CoreOption]>,
HelpText<"Microsoft compiler version number to report in _MSC_VER (0 = don't define it (default))">;
def fms_compatibility_version
: Joined<["-"], "fms-compatibility-version=">,
Group<f_Group>,
      Flags<[CC1Option, CoreOption]>,
HelpText<"Dot-separated value representing the Microsoft compiler "
"version number to report in _MSC_VER (0 = don't define it "
"(default))">;
def fdelayed_template_parsing : Flag<["-"], "fdelayed-template-parsing">, Group<f_Group>,
HelpText<"Parse templated function definitions at the end of the "
"translation unit">, Flags<[CC1Option, CoreOption]>;
def fms_memptr_rep_EQ : Joined<["-"], "fms-memptr-rep=">, Group<f_Group>, Flags<[CC1Option]>;
def fmodules_cache_path : Joined<["-"], "fmodules-cache-path=">, Group<i_Group>,
Flags<[DriverOption, CC1Option]>, MetaVarName<"<directory>">,
HelpText<"Specify the module cache path">;
def fmodules_user_build_path : Separate<["-"], "fmodules-user-build-path">, Group<i_Group>,
Flags<[DriverOption, CC1Option]>, MetaVarName<"<directory>">,
HelpText<"Specify the module user build path">;
def fprebuilt_module_path : Joined<["-"], "fprebuilt-module-path=">, Group<i_Group>,
Flags<[DriverOption, CC1Option]>, MetaVarName<"<directory>">,
HelpText<"Specify the prebuilt module path">;
def fmodules_prune_interval : Joined<["-"], "fmodules-prune-interval=">, Group<i_Group>,
Flags<[CC1Option]>, MetaVarName<"<seconds>">,
HelpText<"Specify the interval (in seconds) between attempts to prune the module cache">;
def fmodules_prune_after : Joined<["-"], "fmodules-prune-after=">, Group<i_Group>,
Flags<[CC1Option]>, MetaVarName<"<seconds>">,
HelpText<"Specify the interval (in seconds) after which a module file will be considered unused">;
def fmodules_search_all : Flag <["-"], "fmodules-search-all">, Group<f_Group>,
Flags<[DriverOption, CC1Option]>,
HelpText<"Search even non-imported modules to resolve references">;
def fbuild_session_timestamp : Joined<["-"], "fbuild-session-timestamp=">,
Group<i_Group>, Flags<[CC1Option]>, MetaVarName<"<time since Epoch in seconds>">,
HelpText<"Time when the current build session started">;
def fbuild_session_file : Joined<["-"], "fbuild-session-file=">,
Group<i_Group>, MetaVarName<"<file>">,
HelpText<"Use the last modification time of <file> as the build session timestamp">;
def fmodules_validate_once_per_build_session : Flag<["-"], "fmodules-validate-once-per-build-session">,
Group<i_Group>, Flags<[CC1Option]>,
HelpText<"Don't verify input files for the modules if the module has been "
"successfully validated or loaded during this build session">;
def fmodules_disable_diagnostic_validation : Flag<["-"], "fmodules-disable-diagnostic-validation">,
Group<i_Group>, Flags<[CC1Option]>,
HelpText<"Disable validation of the diagnostic options when loading the module">;
def fmodules_validate_system_headers : Flag<["-"], "fmodules-validate-system-headers">,
Group<i_Group>, Flags<[CC1Option]>,
HelpText<"Validate the system headers that a module depends on when loading the module">;
def fno_modules_validate_system_headers : Flag<["-"], "fno-modules-validate-system-headers">,
Group<i_Group>, Flags<[DriverOption]>;
def fvalidate_ast_input_files_content:
Flag <["-"], "fvalidate-ast-input-files-content">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Compute and store the hash of input files used to build an AST."
" Files with mismatching mtime's are considered valid"
" if both contents is identical">;
def fmodules_validate_input_files_content:
Flag <["-"], "fmodules-validate-input-files-content">,
Group<f_Group>, Flags<[DriverOption]>,
HelpText<"Validate PCM input files based on content if mtime differs">;
def fno_modules_validate_input_files_content:
Flag <["-"], "fno_modules-validate-input-files-content">,
Group<f_Group>, Flags<[DriverOption]>;
def fpch_validate_input_files_content:
Flag <["-"], "fpch-validate-input-files-content">,
Group<f_Group>, Flags<[DriverOption]>,
HelpText<"Validate PCH input files based on content if mtime differs">;
def fno_pch_validate_input_files_content:
Flag <["-"], "fno_pch-validate-input-files-content">,
Group<f_Group>, Flags<[DriverOption]>;
def fmodules : Flag <["-"], "fmodules">, Group<f_Group>,
Flags<[DriverOption, CC1Option]>,
HelpText<"Enable the 'modules' language feature">;
def fimplicit_module_maps : Flag <["-"], "fimplicit-module-maps">, Group<f_Group>,
Flags<[DriverOption, CC1Option]>,
HelpText<"Implicitly search the file system for module map files.">;
def fmodules_ts : Flag <["-"], "fmodules-ts">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Enable support for the C++ Modules TS">;
def fmodule_maps : Flag <["-"], "fmodule-maps">, Alias<fimplicit_module_maps>;
def fmodule_name_EQ : Joined<["-"], "fmodule-name=">, Group<f_Group>,
Flags<[DriverOption,CC1Option]>, MetaVarName<"<name>">,
HelpText<"Specify the name of the module to build">;
def fmodule_name : Separate<["-"], "fmodule-name">, Alias<fmodule_name_EQ>;
def fmodule_implementation_of : Separate<["-"], "fmodule-implementation-of">,
Flags<[CC1Option]>, Alias<fmodule_name_EQ>;
def fmodule_map_file : Joined<["-"], "fmodule-map-file=">,
Group<f_Group>, Flags<[DriverOption,CC1Option]>, MetaVarName<"<file>">,
HelpText<"Load this module map file">;
def fmodule_file : Joined<["-"], "fmodule-file=">,
Group<i_Group>, Flags<[DriverOption,CC1Option]>, MetaVarName<"[<name>=]<file>">,
HelpText<"Specify the mapping of module name to precompiled module file, or load a module file if name is omitted.">;
def fmodules_ignore_macro : Joined<["-"], "fmodules-ignore-macro=">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Ignore the definition of the given macro when building and loading modules">;
def fmodules_decluse : Flag <["-"], "fmodules-decluse">, Group<f_Group>,
Flags<[DriverOption,CC1Option]>,
HelpText<"Require declaration of modules used within a module">;
def fmodules_strict_decluse : Flag <["-"], "fmodules-strict-decluse">, Group<f_Group>,
Flags<[DriverOption,CC1Option]>,
HelpText<"Like -fmodules-decluse but requires all headers to be in modules">;
def fno_modules_search_all : Flag <["-"], "fno-modules-search-all">, Group<f_Group>,
Flags<[DriverOption, CC1Option]>;
def fno_implicit_modules :
Flag <["-"], "fno-implicit-modules">,
Group<f_Group>, Flags<[DriverOption, CC1Option]>;
def fretain_comments_from_system_headers : Flag<["-"], "fretain-comments-from-system-headers">, Group<f_Group>, Flags<[CC1Option]>;
def fmudflapth : Flag<["-"], "fmudflapth">, Group<f_Group>;
def fmudflap : Flag<["-"], "fmudflap">, Group<f_Group>;
def fnested_functions : Flag<["-"], "fnested-functions">, Group<f_Group>;
def fnext_runtime : Flag<["-"], "fnext-runtime">, Group<f_Group>;
def fno_access_control : Flag<["-"], "fno-access-control">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Disable C++ access control">;
def fno_apple_pragma_pack : Flag<["-"], "fno-apple-pragma-pack">, Group<f_Group>;
def fno_asm : Flag<["-"], "fno-asm">, Group<f_Group>;
def fno_asynchronous_unwind_tables : Flag<["-"], "fno-asynchronous-unwind-tables">, Group<f_Group>;
def fno_assume_sane_operator_new : Flag<["-"], "fno-assume-sane-operator-new">, Group<f_Group>,
HelpText<"Don't assume that C++'s global operator new can't alias any pointer">,
Flags<[CC1Option]>;
def fno_blocks : Flag<["-"], "fno-blocks">, Group<f_Group>, Flags<[CoreOption]>;
def fno_borland_extensions : Flag<["-"], "fno-borland-extensions">, Group<f_Group>;
def fno_builtin : Flag<["-"], "fno-builtin">, Group<f_Group>, Flags<[CC1Option, CoreOption]>,
HelpText<"Disable implicit builtin knowledge of functions">;
def fno_builtin_ : Joined<["-"], "fno-builtin-">, Group<f_Group>, Flags<[CC1Option, CoreOption]>,
HelpText<"Disable implicit builtin knowledge of a specific function">;
def fno_caret_diagnostics : Flag<["-"], "fno-caret-diagnostics">, Group<f_Group>,
Flags<[CC1Option]>;
def fno_color_diagnostics : Flag<["-"], "fno-color-diagnostics">, Group<f_Group>,
Flags<[CoreOption, CC1Option]>;
def fno_diagnostics_color : Flag<["-"], "fno-diagnostics-color">, Group<f_Group>,
Flags<[CoreOption, DriverOption]>;
def fno_common : Flag<["-"], "fno-common">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Compile common globals like normal definitions">;
def fno_constant_cfstrings : Flag<["-"], "fno-constant-cfstrings">, Group<f_Group>,
Flags<[CC1Option]>,
HelpText<"Disable creation of CodeFoundation-type constant strings">;
def fno_cxx_exceptions: Flag<["-"], "fno-cxx-exceptions">, Group<f_Group>;
def fno_cxx_modules : Flag <["-"], "fno-cxx-modules">, Group<f_Group>,
Flags<[DriverOption]>;
def fno_diagnostics_fixit_info : Flag<["-"], "fno-diagnostics-fixit-info">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Do not include fixit information in diagnostics">;
def fno_diagnostics_show_hotness : Flag<["-"], "fno-diagnostics-show-hotness">, Group<f_Group>;
def fno_diagnostics_show_option : Flag<["-"], "fno-diagnostics-show-option">, Group<f_Group>;
def fno_diagnostics_show_note_include_stack : Flag<["-"], "fno-diagnostics-show-note-include-stack">,
Flags<[CC1Option]>, Group<f_Group>;
def fdigraphs : Flag<["-"], "fdigraphs">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Enable alternative token representations '<:', ':>', '<%', '%>', '%:', '%:%:' (default)">;
def fno_digraphs : Flag<["-"], "fno-digraphs">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Disallow alternative token representations '<:', ':>', '<%', '%>', '%:', '%:%:'">;
def fno_declspec : Flag<["-"], "fno-declspec">, Group<f_clang_Group>,
HelpText<"Disallow __declspec as a keyword">, Flags<[CC1Option]>;
def fno_dollars_in_identifiers : Flag<["-"], "fno-dollars-in-identifiers">, Group<f_Group>,
HelpText<"Disallow '$' in identifiers">, Flags<[CC1Option]>;
def fno_elide_constructors : Flag<["-"], "fno-elide-constructors">, Group<f_Group>,
HelpText<"Disable C++ copy constructor elision">, Flags<[CC1Option]>;
def fno_eliminate_unused_debug_symbols : Flag<["-"], "fno-eliminate-unused-debug-symbols">, Group<f_Group>;
def fno_exceptions : Flag<["-"], "fno-exceptions">, Group<f_Group>;
def fno_gnu_keywords : Flag<["-"], "fno-gnu-keywords">, Group<f_Group>, Flags<[CC1Option]>;
def fno_inline_functions : Flag<["-"], "fno-inline-functions">, Group<f_clang_Group>, Flags<[CC1Option]>;
def fno_inline : Flag<["-"], "fno-inline">, Group<f_clang_Group>, Flags<[CC1Option]>;
def fno_experimental_isel : Flag<["-"], "fno-experimental-isel">, Group<f_clang_Group>,
HelpText<"Disables the experimental global instruction selector">;
def fno_experimental_new_pass_manager : Flag<["-"], "fno-experimental-new-pass-manager">,
Group<f_clang_Group>, Flags<[CC1Option]>,
HelpText<"Disables an experimental new pass manager in LLVM.">;
def fveclib : Joined<["-"], "fveclib=">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Use the given vector functions library">, Values<"Accelerate,MASSV,SVML,none">;
def fno_lax_vector_conversions : Flag<["-"], "fno-lax-vector-conversions">, Group<f_Group>,
Alias<flax_vector_conversions_EQ>, AliasArgs<["none"]>;
def fno_merge_all_constants : Flag<["-"], "fno-merge-all-constants">, Group<f_Group>,
HelpText<"Disallow merging of constants">;
def fno_modules : Flag <["-"], "fno-modules">, Group<f_Group>,
Flags<[DriverOption]>;
def fno_implicit_module_maps : Flag <["-"], "fno-implicit-module-maps">, Group<f_Group>,
Flags<[DriverOption]>;
def fno_module_maps : Flag <["-"], "fno-module-maps">, Alias<fno_implicit_module_maps>;
def fno_modules_decluse : Flag <["-"], "fno-modules-decluse">, Group<f_Group>,
Flags<[DriverOption]>;
def fno_modules_strict_decluse : Flag <["-"], "fno-strict-modules-decluse">, Group<f_Group>,
Flags<[DriverOption]>;
def fimplicit_modules : Flag <["-"], "fimplicit-modules">, Group<f_Group>,
Flags<[DriverOption]>;
def fmodule_file_deps : Flag <["-"], "fmodule-file-deps">, Group<f_Group>,
Flags<[DriverOption]>;
def fno_module_file_deps : Flag <["-"], "fno-module-file-deps">, Group<f_Group>,
Flags<[DriverOption]>;
def fno_ms_extensions : Flag<["-"], "fno-ms-extensions">, Group<f_Group>,
Flags<[CoreOption]>;
def fno_ms_compatibility : Flag<["-"], "fno-ms-compatibility">, Group<f_Group>,
Flags<[CoreOption]>;
def fno_delayed_template_parsing : Flag<["-"], "fno-delayed-template-parsing">, Group<f_Group>,
HelpText<"Disable delayed template parsing">,
Flags<[DriverOption, CoreOption]>;
def fno_objc_exceptions: Flag<["-"], "fno-objc-exceptions">, Group<f_Group>;
def fno_objc_legacy_dispatch : Flag<["-"], "fno-objc-legacy-dispatch">, Group<f_Group>;
def fno_objc_weak : Flag<["-"], "fno-objc-weak">, Group<f_Group>, Flags<[CC1Option]>;
def fno_omit_frame_pointer : Flag<["-"], "fno-omit-frame-pointer">, Group<f_Group>;
def fno_operator_names : Flag<["-"], "fno-operator-names">, Group<f_Group>,
HelpText<"Do not treat C++ operator name keywords as synonyms for operators">,
Flags<[CC1Option]>;
def fno_pascal_strings : Flag<["-"], "fno-pascal-strings">, Group<f_Group>;
def fno_rtti : Flag<["-"], "fno-rtti">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Disable generation of rtti information">;
def fno_rtti_data : Flag<["-"], "fno-rtti-data">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Control emission of RTTI data">;
def fno_short_enums : Flag<["-"], "fno-short-enums">, Group<f_Group>;
def fno_show_column : Flag<["-"], "fno-show-column">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Do not include column number on diagnostics">;
def fno_show_source_location : Flag<["-"], "fno-show-source-location">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Do not include source location information with diagnostics">;
def fdiagnostics_absolute_paths : Flag<["-"], "fdiagnostics-absolute-paths">, Group<f_Group>,
Flags<[CC1Option, CoreOption]>, HelpText<"Print absolute paths in diagnostics">;
def fno_spell_checking : Flag<["-"], "fno-spell-checking">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Disable spell-checking">;
def fno_stack_protector : Flag<["-"], "fno-stack-protector">, Group<f_Group>,
HelpText<"Disable the use of stack protectors">;
def fno_strict_aliasing : Flag<["-"], "fno-strict-aliasing">, Group<f_Group>,
Flags<[DriverOption, CoreOption]>;
def fstruct_path_tbaa : Flag<["-"], "fstruct-path-tbaa">, Group<f_Group>;
def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group<f_Group>;
def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group<f_Group>;
def fno_strict_vtable_pointers: Flag<["-"], "fno-strict-vtable-pointers">,
Group<f_Group>;
def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>;
def fno_temp_file : Flag<["-"], "fno-temp-file">, Group<f_Group>,
Flags<[CC1Option, CoreOption]>, HelpText<
"Directly create compilation output files. This may lead to incorrect incremental builds if the compiler crashes">;
def fno_threadsafe_statics : Flag<["-"], "fno-threadsafe-statics">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Do not emit code to make initialization of local statics thread safe">;
def fno_use_cxa_atexit : Flag<["-"], "fno-use-cxa-atexit">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Don't use __cxa_atexit for calling destructors">;
def fno_register_global_dtors_with_atexit : Flag<["-"], "fno-register-global-dtors-with-atexit">, Group<f_Group>,
HelpText<"Don't use atexit or __cxa_atexit to register global destructors">;
def fno_use_init_array : Flag<["-"], "fno-use-init-array">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Don't use .init_array instead of .ctors">;
def fno_unit_at_a_time : Flag<["-"], "fno-unit-at-a-time">, Group<f_Group>;
def fno_unwind_tables : Flag<["-"], "fno-unwind-tables">, Group<f_Group>;
def fno_verbose_asm : Flag<["-"], "fno-verbose-asm">, Group<f_Group>;
def fno_working_directory : Flag<["-"], "fno-working-directory">, Group<f_Group>;
def fno_wrapv : Flag<["-"], "fno-wrapv">, Group<f_Group>;
def fno_zero_initialized_in_bss : Flag<["-"], "fno-zero-initialized-in-bss">, Group<f_Group>;
def fobjc_arc : Flag<["-"], "fobjc-arc">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Synthesize retain and release calls for Objective-C pointers">;
def fno_objc_arc : Flag<["-"], "fno-objc-arc">, Group<f_Group>;
def fobjc_convert_messages_to_runtime_calls :
Flag<["-"], "fobjc-convert-messages-to-runtime-calls">, Group<f_Group>;
def fno_objc_convert_messages_to_runtime_calls :
Flag<["-"], "fno-objc-convert-messages-to-runtime-calls">, Group<f_Group>, Flags<[CC1Option]>;
def fobjc_arc_exceptions : Flag<["-"], "fobjc-arc-exceptions">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Use EH-safe code when synthesizing retains and releases in -fobjc-arc">;
def fno_objc_arc_exceptions : Flag<["-"], "fno-objc-arc-exceptions">, Group<f_Group>;
def fobjc_atdefs : Flag<["-"], "fobjc-atdefs">, Group<clang_ignored_f_Group>;
def fobjc_call_cxx_cdtors : Flag<["-"], "fobjc-call-cxx-cdtors">, Group<clang_ignored_f_Group>;
def fobjc_exceptions: Flag<["-"], "fobjc-exceptions">, Group<f_Group>,
HelpText<"Enable Objective-C exceptions">, Flags<[CC1Option]>;
def fapplication_extension : Flag<["-"], "fapplication-extension">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Restrict code to those available for App Extensions">;
def fno_application_extension : Flag<["-"], "fno-application-extension">,
Group<f_Group>;
def frelaxed_template_template_args : Flag<["-"], "frelaxed-template-template-args">,
Flags<[CC1Option]>, HelpText<"Enable C++17 relaxed template template argument matching">,
Group<f_Group>;
def fno_relaxed_template_template_args : Flag<["-"], "fno-relaxed-template-template-args">,
Group<f_Group>;
def fsized_deallocation : Flag<["-"], "fsized-deallocation">, Flags<[CC1Option]>,
HelpText<"Enable C++14 sized global deallocation functions">, Group<f_Group>;
def fno_sized_deallocation: Flag<["-"], "fno-sized-deallocation">, Group<f_Group>;
def faligned_allocation : Flag<["-"], "faligned-allocation">, Flags<[CC1Option]>,
HelpText<"Enable C++17 aligned allocation functions">, Group<f_Group>;
def fno_aligned_allocation: Flag<["-"], "fno-aligned-allocation">,
Group<f_Group>, Flags<[CC1Option]>;
def fnew_alignment_EQ : Joined<["-"], "fnew-alignment=">,
HelpText<"Specifies the largest alignment guaranteed by '::operator new(size_t)'">,
MetaVarName<"<align>">, Group<f_Group>, Flags<[CC1Option]>;
def : Separate<["-"], "fnew-alignment">, Alias<fnew_alignment_EQ>;
def : Flag<["-"], "faligned-new">, Alias<faligned_allocation>;
def : Flag<["-"], "fno-aligned-new">, Alias<fno_aligned_allocation>;
def faligned_new_EQ : Joined<["-"], "faligned-new=">;
def fobjc_legacy_dispatch : Flag<["-"], "fobjc-legacy-dispatch">, Group<f_Group>;
def fobjc_new_property : Flag<["-"], "fobjc-new-property">, Group<clang_ignored_f_Group>;
def fobjc_infer_related_result_type : Flag<["-"], "fobjc-infer-related-result-type">,
Group<f_Group>;
def fno_objc_infer_related_result_type : Flag<["-"],
"fno-objc-infer-related-result-type">, Group<f_Group>,
HelpText<
"do not infer Objective-C related result type based on method family">,
Flags<[CC1Option]>;
def fobjc_link_runtime: Flag<["-"], "fobjc-link-runtime">, Group<f_Group>;
def fobjc_weak : Flag<["-"], "fobjc-weak">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Enable ARC-style weak references in Objective-C">;
// Objective-C ABI options.
def fobjc_runtime_EQ : Joined<["-"], "fobjc-runtime=">, Group<f_Group>, Flags<[CC1Option, CoreOption]>,
HelpText<"Specify the target Objective-C runtime kind and version">;
def fobjc_abi_version_EQ : Joined<["-"], "fobjc-abi-version=">, Group<f_Group>;
def fobjc_nonfragile_abi_version_EQ : Joined<["-"], "fobjc-nonfragile-abi-version=">, Group<f_Group>;
def fobjc_nonfragile_abi : Flag<["-"], "fobjc-nonfragile-abi">, Group<f_Group>;
def fno_objc_nonfragile_abi : Flag<["-"], "fno-objc-nonfragile-abi">, Group<f_Group>;
def fobjc_sender_dependent_dispatch : Flag<["-"], "fobjc-sender-dependent-dispatch">, Group<f_Group>;
def fomit_frame_pointer : Flag<["-"], "fomit-frame-pointer">, Group<f_Group>;
def fopenmp : Flag<["-"], "fopenmp">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
HelpText<"Parse OpenMP pragmas and generate parallel code.">;
def fno_openmp : Flag<["-"], "fno-openmp">, Group<f_Group>, Flags<[NoArgumentUnused]>;
def fopenmp_version_EQ : Joined<["-"], "fopenmp-version=">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
def fopenmp_EQ : Joined<["-"], "fopenmp=">, Group<f_Group>;
def fopenmp_use_tls : Flag<["-"], "fopenmp-use-tls">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>;
def fnoopenmp_use_tls : Flag<["-"], "fnoopenmp-use-tls">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fopenmp_targets_EQ : CommaJoined<["-"], "fopenmp-targets=">, Flags<[DriverOption, CC1Option]>,
HelpText<"Specify comma-separated list of triples OpenMP offloading targets to be supported">;
def fopenmp_relocatable_target : Flag<["-"], "fopenmp-relocatable-target">,
Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fnoopenmp_relocatable_target : Flag<["-"], "fnoopenmp-relocatable-target">,
Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fopenmp_simd : Flag<["-"], "fopenmp-simd">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
HelpText<"Emit OpenMP code only for SIMD-based constructs.">;
def fopenmp_enable_irbuilder : Flag<["-"], "fopenmp-enable-irbuilder">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
HelpText<"Use the experimental OpenMP-IR-Builder codegen path.">;
def fno_openmp_simd : Flag<["-"], "fno-openmp-simd">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
def fopenmp_cuda_mode : Flag<["-"], "fopenmp-cuda-mode">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fno_openmp_cuda_mode : Flag<["-"], "fno-openmp-cuda-mode">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>;
def fopenmp_cuda_force_full_runtime : Flag<["-"], "fopenmp-cuda-force-full-runtime">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fno_openmp_cuda_force_full_runtime : Flag<["-"], "fno-openmp-cuda-force-full-runtime">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>;
def fopenmp_cuda_number_of_sm_EQ : Joined<["-"], "fopenmp-cuda-number-of-sm=">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fopenmp_cuda_blocks_per_sm_EQ : Joined<["-"], "fopenmp-cuda-blocks-per-sm=">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fopenmp_cuda_teams_reduction_recs_num_EQ : Joined<["-"], "fopenmp-cuda-teams-reduction-recs-num=">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fopenmp_optimistic_collapse : Flag<["-"], "fopenmp-optimistic-collapse">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fno_openmp_optimistic_collapse : Flag<["-"], "fno-openmp-optimistic-collapse">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>;
def static_openmp: Flag<["-"], "static-openmp">,
HelpText<"Use the static host OpenMP runtime while linking.">;
def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group<f_Group>;
def fno_escaping_block_tail_calls : Flag<["-"], "fno-escaping-block-tail-calls">, Group<f_Group>, Flags<[CC1Option]>;
def fescaping_block_tail_calls : Flag<["-"], "fescaping-block-tail-calls">, Group<f_Group>;
def force__cpusubtype__ALL : Flag<["-"], "force_cpusubtype_ALL">;
def force__flat__namespace : Flag<["-"], "force_flat_namespace">;
def force__load : Separate<["-"], "force_load">;
def force_addr : Joined<["-"], "fforce-addr">, Group<clang_ignored_f_Group>;
def foutput_class_dir_EQ : Joined<["-"], "foutput-class-dir=">, Group<f_Group>;
def fpack_struct : Flag<["-"], "fpack-struct">, Group<f_Group>;
def fno_pack_struct : Flag<["-"], "fno-pack-struct">, Group<f_Group>;
def fpack_struct_EQ : Joined<["-"], "fpack-struct=">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Specify the default maximum struct packing alignment">;
def fmax_type_align_EQ : Joined<["-"], "fmax-type-align=">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Specify the maximum alignment to enforce on pointers lacking an explicit alignment">;
def fno_max_type_align : Flag<["-"], "fno-max-type-align">, Group<f_Group>;
def fpascal_strings : Flag<["-"], "fpascal-strings">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Recognize and construct Pascal-style string literals">;
def fpatchable_function_entry_EQ : Joined<["-"], "fpatchable-function-entry=">, Group<f_Group>, Flags<[CC1Option]>,
MetaVarName<"<N,M>">, HelpText<"Generate M NOPs before function entry and N-M NOPs after function entry">;
def fpcc_struct_return : Flag<["-"], "fpcc-struct-return">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Override the default ABI to return all structs on the stack">;
def fpch_preprocess : Flag<["-"], "fpch-preprocess">, Group<f_Group>;
def fpic : Flag<["-"], "fpic">, Group<f_Group>;
def fno_pic : Flag<["-"], "fno-pic">, Group<f_Group>;
def fpie : Flag<["-"], "fpie">, Group<f_Group>;
def fno_pie : Flag<["-"], "fno-pie">, Group<f_Group>;
def fplt : Flag<["-"], "fplt">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Use the PLT to make function calls">;
def fno_plt : Flag<["-"], "fno-plt">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Do not use the PLT to make function calls">;
def fropi : Flag<["-"], "fropi">, Group<f_Group>, Flags<[CC1Option]>;
def fno_ropi : Flag<["-"], "fno-ropi">, Group<f_Group>;
def frwpi : Flag<["-"], "frwpi">, Group<f_Group>, Flags<[CC1Option]>;
def fno_rwpi : Flag<["-"], "fno-rwpi">, Group<f_Group>;
def fplugin_EQ : Joined<["-"], "fplugin=">, Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<dsopath>">,
HelpText<"Load the named plugin (dynamic shared object)">;
def fpass_plugin_EQ : Joined<["-"], "fpass-plugin=">,
Group<f_Group>, Flags<[CC1Option]>, MetaVarName<"<dsopath>">,
HelpText<"Load pass plugin from a dynamic shared object file (only with new pass manager).">;
def fpreserve_as_comments : Flag<["-"], "fpreserve-as-comments">, Group<f_Group>;
def fno_preserve_as_comments : Flag<["-"], "fno-preserve-as-comments">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Do not preserve comments in inline assembly">;
def fprofile_arcs : Flag<["-"], "fprofile-arcs">, Group<f_Group>;
def fno_profile_arcs : Flag<["-"], "fno-profile-arcs">, Group<f_Group>;
def framework : Separate<["-"], "framework">, Flags<[LinkerInput]>;
def frandom_seed_EQ : Joined<["-"], "frandom-seed=">, Group<clang_ignored_f_Group>;
def freg_struct_return : Flag<["-"], "freg-struct-return">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Override the default ABI to return small structs in registers">;
def frtti : Flag<["-"], "frtti">, Group<f_Group>;
def : Flag<["-"], "fsched-interblock">, Group<clang_ignored_f_Group>;
def fshort_enums : Flag<["-"], "fshort-enums">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Allocate to an enum type only as many bytes as it needs for the declared range of possible values">;
def fchar8__t : Flag<["-"], "fchar8_t">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Enable C++ builtin type char8_t">;
def fno_char8__t : Flag<["-"], "fno-char8_t">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Disable C++ builtin type char8_t">;
def fshort_wchar : Flag<["-"], "fshort-wchar">, Group<f_Group>,
HelpText<"Force wchar_t to be a short unsigned int">;
def fno_short_wchar : Flag<["-"], "fno-short-wchar">, Group<f_Group>,
HelpText<"Force wchar_t to be an unsigned int">;
def fshow_overloads_EQ : Joined<["-"], "fshow-overloads=">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Which overload candidates to show when overload resolution fails: "
"best|all; defaults to all">, Values<"best,all">;
def fshow_column : Flag<["-"], "fshow-column">, Group<f_Group>, Flags<[CC1Option]>;
def fshow_source_location : Flag<["-"], "fshow-source-location">, Group<f_Group>;
def fspell_checking : Flag<["-"], "fspell-checking">, Group<f_Group>;
def fspell_checking_limit_EQ : Joined<["-"], "fspell-checking-limit=">, Group<f_Group>;
def fsigned_bitfields : Flag<["-"], "fsigned-bitfields">, Group<f_Group>;
def fsigned_char : Flag<["-"], "fsigned-char">, Group<f_Group>;
def fno_signed_char : Flag<["-"], "fno-signed-char">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Char is unsigned">;
def fsplit_stack : Flag<["-"], "fsplit-stack">, Group<f_Group>;
def fstack_protector_all : Flag<["-"], "fstack-protector-all">, Group<f_Group>,
HelpText<"Enable stack protectors for all functions">;
def fstack_protector_strong : Flag<["-"], "fstack-protector-strong">, Group<f_Group>,
HelpText<"Enable stack protectors for some functions vulnerable to stack smashing. "
"Compared to -fstack-protector, this uses a stronger heuristic "
"that includes functions containing arrays of any size (and any type), "
"as well as any calls to alloca or the taking of an address from a local variable">;
def fstack_protector : Flag<["-"], "fstack-protector">, Group<f_Group>,
HelpText<"Enable stack protectors for some functions vulnerable to stack smashing. "
"This uses a loose heuristic which considers functions vulnerable "
"if they contain a char (or 8bit integer) array or constant sized calls to "
"alloca, which are of greater size than ssp-buffer-size (default: 8 bytes). "
"All variable sized calls to alloca are considered vulnerable">;
def ftrivial_auto_var_init : Joined<["-"], "ftrivial-auto-var-init=">, Group<f_Group>,
Flags<[CC1Option, CoreOption]>, HelpText<"Initialize trivial automatic stack variables: uninitialized (default)"
" | pattern">, Values<"uninitialized,pattern">;
def enable_trivial_var_init_zero : Flag<["-"], "enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang">,
Flags<[CC1Option, CoreOption]>,
HelpText<"Trivial automatic variable initialization to zero is only here for benchmarks, it'll eventually be removed, and I'm OK with that because I'm only using it to benchmark">;
def fstandalone_debug : Flag<["-"], "fstandalone-debug">, Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Emit full debug info for all types used by the program">;
def fno_standalone_debug : Flag<["-"], "fno-standalone-debug">, Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Limit debug information produced to reduce size of debug binary">;
def flimit_debug_info : Flag<["-"], "flimit-debug-info">, Flags<[CoreOption]>, Alias<fno_standalone_debug>;
def fno_limit_debug_info : Flag<["-"], "fno-limit-debug-info">, Flags<[CoreOption]>, Alias<fstandalone_debug>;
def fdebug_macro : Flag<["-"], "fdebug-macro">, Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Emit macro debug information">;
def fno_debug_macro : Flag<["-"], "fno-debug-macro">, Group<f_Group>, Flags<[CoreOption]>,
HelpText<"Do not emit macro debug information">;
def fstrict_aliasing : Flag<["-"], "fstrict-aliasing">, Group<f_Group>,
Flags<[DriverOption, CoreOption]>;
def fstrict_enums : Flag<["-"], "fstrict-enums">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Enable optimizations based on the strict definition of an enum's "
"value range">;
def fstrict_vtable_pointers: Flag<["-"], "fstrict-vtable-pointers">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Enable optimizations based on the strict rules for overwriting "
"polymorphic C++ objects">;
def fstrict_overflow : Flag<["-"], "fstrict-overflow">, Group<f_Group>;
def fsyntax_only : Flag<["-"], "fsyntax-only">,
Flags<[DriverOption,CoreOption,CC1Option]>, Group<Action_Group>;
def ftabstop_EQ : Joined<["-"], "ftabstop=">, Group<f_Group>;
def ftemplate_depth_EQ : Joined<["-"], "ftemplate-depth=">, Group<f_Group>;
def ftemplate_depth_ : Joined<["-"], "ftemplate-depth-">, Group<f_Group>;
def ftemplate_backtrace_limit_EQ : Joined<["-"], "ftemplate-backtrace-limit=">,
Group<f_Group>;
def foperator_arrow_depth_EQ : Joined<["-"], "foperator-arrow-depth=">,
Group<f_Group>;
def fsave_optimization_record : Flag<["-"], "fsave-optimization-record">,
Group<f_Group>, HelpText<"Generate a YAML optimization record file">;
def fsave_optimization_record_EQ : Joined<["-"], "fsave-optimization-record=">,
Group<f_Group>, HelpText<"Generate an optimization record file in a specific format">,
MetaVarName<"<format>">;
def fno_save_optimization_record : Flag<["-"], "fno-save-optimization-record">,
Group<f_Group>, Flags<[NoArgumentUnused]>;
def foptimization_record_file_EQ : Joined<["-"], "foptimization-record-file=">,
Group<f_Group>,
HelpText<"Specify the output name of the file containing the optimization remarks. Implies -fsave-optimization-record. On Darwin platforms, this cannot be used with multiple -arch <arch> options.">,
MetaVarName<"<file>">;
def foptimization_record_passes_EQ : Joined<["-"], "foptimization-record-passes=">,
Group<f_Group>,
HelpText<"Only include passes which match a specified regular expression in the generated optimization record (by default, include all passes)">,
MetaVarName<"<regex>">;
def ftest_coverage : Flag<["-"], "ftest-coverage">, Group<f_Group>;
def fvectorize : Flag<["-"], "fvectorize">, Group<f_Group>,
HelpText<"Enable the loop vectorization passes">;
def fno_vectorize : Flag<["-"], "fno-vectorize">, Group<f_Group>;
def : Flag<["-"], "ftree-vectorize">, Alias<fvectorize>;
def : Flag<["-"], "fno-tree-vectorize">, Alias<fno_vectorize>;
def fslp_vectorize : Flag<["-"], "fslp-vectorize">, Group<f_Group>,
HelpText<"Enable the superword-level parallelism vectorization passes">;
def fno_slp_vectorize : Flag<["-"], "fno-slp-vectorize">, Group<f_Group>;
def : Flag<["-"], "ftree-slp-vectorize">, Alias<fslp_vectorize>;
def : Flag<["-"], "fno-tree-slp-vectorize">, Alias<fno_slp_vectorize>;
def Wlarge_by_value_copy_def : Flag<["-"], "Wlarge-by-value-copy">,
HelpText<"Warn if a function definition returns or accepts an object larger "
"in bytes than a given value">, Flags<[HelpHidden]>;
def Wlarge_by_value_copy_EQ : Joined<["-"], "Wlarge-by-value-copy=">, Flags<[CC1Option]>;
// These "special" warning flags are effectively processed as f_Group flags by the driver:
// Just silence warnings about -Wlarger-than for now.
def Wlarger_than_EQ : Joined<["-"], "Wlarger-than=">, Group<clang_ignored_f_Group>;
def Wlarger_than_ : Joined<["-"], "Wlarger-than-">, Alias<Wlarger_than_EQ>;
def Wframe_larger_than_EQ : Joined<["-"], "Wframe-larger-than=">, Group<f_Group>, Flags<[DriverOption]>;
def : Flag<["-"], "fterminated-vtables">, Alias<fapple_kext>;
def fthreadsafe_statics : Flag<["-"], "fthreadsafe-statics">, Group<f_Group>;
def ftime_report : Flag<["-"], "ftime-report">, Group<f_Group>, Flags<[CC1Option]>;
def ftime_trace : Flag<["-"], "ftime-trace">, Group<f_Group>,
HelpText<"Turn on time profiler. Generates JSON file based on output filename.">,
DocBrief<[{
Turn on the time profiler. Generates a JSON file based on the output filename. Results
can be analyzed with chrome://tracing or `Speedscope App
<https://www.speedscope.app>`_ for flamegraph visualization.}]>,
Flags<[CC1Option, CoreOption]>;
def ftime_trace_granularity_EQ : Joined<["-"], "ftime-trace-granularity=">, Group<f_Group>,
HelpText<"Minimum time granularity (in microseconds) traced by time profiler">,
Flags<[CC1Option, CoreOption]>;
def ftlsmodel_EQ : Joined<["-"], "ftls-model=">, Group<f_Group>, Flags<[CC1Option]>;
def ftrapv : Flag<["-"], "ftrapv">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Trap on integer overflow">;
def ftrapv_handler_EQ : Joined<["-"], "ftrapv-handler=">, Group<f_Group>,
MetaVarName<"<function name>">,
HelpText<"Specify the function to be called on overflow">;
def ftrapv_handler : Separate<["-"], "ftrapv-handler">, Group<f_Group>, Flags<[CC1Option]>;
def ftrap_function_EQ : Joined<["-"], "ftrap-function=">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Issue call to specified function rather than a trap instruction">;
def funit_at_a_time : Flag<["-"], "funit-at-a-time">, Group<f_Group>;
def funroll_loops : Flag<["-"], "funroll-loops">, Group<f_Group>,
HelpText<"Turn on loop unroller">, Flags<[CC1Option]>;
def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>,
HelpText<"Turn off loop unroller">, Flags<[CC1Option]>;
def freroll_loops : Flag<["-"], "freroll-loops">, Group<f_Group>,
HelpText<"Turn on loop reroller">, Flags<[CC1Option]>;
def fno_reroll_loops : Flag<["-"], "fno-reroll-loops">, Group<f_Group>,
HelpText<"Turn off loop reroller">;
def ftrigraphs : Flag<["-"], "ftrigraphs">, Group<f_Group>,
HelpText<"Process trigraph sequences">, Flags<[CC1Option]>;
def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group<f_Group>,
HelpText<"Do not process trigraph sequences">, Flags<[CC1Option]>;
def funsigned_bitfields : Flag<["-"], "funsigned-bitfields">, Group<f_Group>;
def funsigned_char : Flag<["-"], "funsigned-char">, Group<f_Group>;
def fno_unsigned_char : Flag<["-"], "fno-unsigned-char">;
def funwind_tables : Flag<["-"], "funwind-tables">, Group<f_Group>;
def fuse_cxa_atexit : Flag<["-"], "fuse-cxa-atexit">, Group<f_Group>;
def fregister_global_dtors_with_atexit : Flag<["-"], "fregister-global-dtors-with-atexit">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Use atexit or __cxa_atexit to register global destructors">;
def fuse_init_array : Flag<["-"], "fuse-init-array">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Use .init_array instead of .ctors">;
def fno_var_tracking : Flag<["-"], "fno-var-tracking">, Group<clang_ignored_f_Group>;
def fverbose_asm : Flag<["-"], "fverbose-asm">, Group<f_Group>;
def dA : Flag<["-"], "dA">, Alias<fverbose_asm>;
def fvisibility_EQ : Joined<["-"], "fvisibility=">, Group<f_Group>,
HelpText<"Set the default symbol visibility for all global declarations">, Values<"hidden,default">;
def fvisibility_inlines_hidden : Flag<["-"], "fvisibility-inlines-hidden">, Group<f_Group>,
HelpText<"Give inline C++ member functions hidden visibility by default">,
Flags<[CC1Option]>;
def fvisibility_ms_compat : Flag<["-"], "fvisibility-ms-compat">, Group<f_Group>,
HelpText<"Give global types 'default' visibility and global functions and "
"variables 'hidden' visibility by default">;
def fvisibility_global_new_delete_hidden : Flag<["-"], "fvisibility-global-new-delete-hidden">, Group<f_Group>,
HelpText<"Give global C++ operator new and delete declarations hidden visibility">, Flags<[CC1Option]>;
def fwhole_program_vtables : Flag<["-"], "fwhole-program-vtables">, Group<f_Group>,
Flags<[CoreOption, CC1Option]>,
HelpText<"Enables whole-program vtable optimization. Requires -flto">;
def fno_whole_program_vtables : Flag<["-"], "fno-whole-program-vtables">, Group<f_Group>,
Flags<[CoreOption]>;
def fsplit_lto_unit : Flag<["-"], "fsplit-lto-unit">, Group<f_Group>,
Flags<[CoreOption, CC1Option]>,
HelpText<"Enables splitting of the LTO unit.">;
def fno_split_lto_unit : Flag<["-"], "fno-split-lto-unit">, Group<f_Group>,
Flags<[CoreOption]>;
def fforce_emit_vtables : Flag<["-"], "fforce-emit-vtables">, Group<f_Group>,
Flags<[CC1Option]>,
HelpText<"Emits more virtual tables to improve devirtualization">;
def fno_force_emit_vtables : Flag<["-"], "fno-force-emit-vtables">, Group<f_Group>,
Flags<[CoreOption]>;
def fvirtual_function_elimination : Flag<["-"], "fvirtual-function-elimination">, Group<f_Group>,
Flags<[CoreOption, CC1Option]>,
HelpText<"Enables dead virtual function elimination optimization. Requires -flto=full">;
def fno_virtual_function_elimination : Flag<["-"], "fno-virtual-function-elimination">, Group<f_Group>,
Flags<[CoreOption]>;
def fwrapv : Flag<["-"], "fwrapv">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Treat signed integer overflow as two's complement">;
def fwritable_strings : Flag<["-"], "fwritable-strings">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Store string literals as writable data">;
def fzero_initialized_in_bss : Flag<["-"], "fzero-initialized-in-bss">, Group<f_Group>;
def ffunction_sections : Flag<["-"], "ffunction-sections">, Group<f_Group>,
Flags<[CC1Option]>,
HelpText<"Place each function in its own section (ELF Only)">;
def fno_function_sections : Flag<["-"], "fno-function-sections">,
Group<f_Group>, Flags<[CC1Option]>;
def fdata_sections : Flag <["-"], "fdata-sections">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Place each data in its own section (ELF Only)">;
def fno_data_sections : Flag <["-"], "fno-data-sections">, Group<f_Group>,
Flags<[CC1Option]>;
def fstack_size_section : Flag<["-"], "fstack-size-section">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Emit section containing metadata on function stack sizes">;
def fno_stack_size_section : Flag<["-"], "fno-stack-size-section">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Don't emit section containing metadata on function stack sizes">;
def funique_section_names : Flag <["-"], "funique-section-names">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Use unique names for text and data sections (ELF Only)">;
def fno_unique_section_names : Flag <["-"], "fno-unique-section-names">,
Group<f_Group>, Flags<[CC1Option]>;
def fstrict_return : Flag<["-"], "fstrict-return">, Group<f_Group>,
Flags<[CC1Option]>,
HelpText<"Always treat control flow paths that fall off the end of a "
"non-void function as unreachable">;
def fno_strict_return : Flag<["-"], "fno-strict-return">, Group<f_Group>,
Flags<[CC1Option]>;
def fallow_editor_placeholders : Flag<["-"], "fallow-editor-placeholders">,
Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Treat editor placeholders as valid source code">;
def fno_allow_editor_placeholders : Flag<["-"],
"fno-allow-editor-placeholders">, Group<f_Group>;
def fdebug_types_section: Flag <["-"], "fdebug-types-section">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Place debug types in their own section (ELF Only)">;
def fno_debug_types_section: Flag<["-"], "fno-debug-types-section">, Group<f_Group>,
Flags<[CC1Option]>;
def fdebug_ranges_base_address: Flag <["-"], "fdebug-ranges-base-address">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Use DWARF base address selection entries in debug_ranges">;
def fno_debug_ranges_base_address: Flag <["-"], "fno-debug-ranges-base-address">, Group<f_Group>,
Flags<[CC1Option]>;
def fsplit_dwarf_inlining: Flag <["-"], "fsplit-dwarf-inlining">, Group<f_Group>,
Flags<[CC1Option]>, HelpText<"Provide minimal debug info in the object/executable to facilitate online symbolication/stack traces in the absence of .dwo/.dwp files when using Split DWARF">;
def fno_split_dwarf_inlining: Flag<["-"], "fno-split-dwarf-inlining">, Group<f_Group>,
Flags<[CC1Option]>;
def fdebug_default_version: Joined<["-"], "fdebug-default-version=">, Group<f_Group>,
HelpText<"Default DWARF version to use, if a -g option caused DWARF debug info to be produced">;
def fdebug_prefix_map_EQ
: Joined<["-"], "fdebug-prefix-map=">, Group<f_Group>,
Flags<[CC1Option,CC1AsOption]>,
HelpText<"remap file source paths in debug info">;
def ffile_prefix_map_EQ
: Joined<["-"], "ffile-prefix-map=">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"remap file source paths in debug info and predefined preprocessor macros">;
def fmacro_prefix_map_EQ
: Joined<["-"], "fmacro-prefix-map=">, Group<Preprocessor_Group>, Flags<[CC1Option]>,
HelpText<"remap file source paths in predefined preprocessor macros">;
def fforce_dwarf_frame : Flag<["-"], "fforce-dwarf-frame">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Always emit a debug frame section">;
def fno_force_dwarf_frame : Flag<["-"], "fno-force-dwarf-frame">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Don't always emit a debug frame section">;
def g_Flag : Flag<["-"], "g">, Group<g_Group>,
HelpText<"Generate source-level debug information">;
def gline_tables_only : Flag<["-"], "gline-tables-only">, Group<gN_Group>,
Flags<[CoreOption]>, HelpText<"Emit debug line number tables only">;
def gline_directives_only : Flag<["-"], "gline-directives-only">, Group<gN_Group>,
Flags<[CoreOption]>, HelpText<"Emit debug line info directives only">;
def gmlt : Flag<["-"], "gmlt">, Alias<gline_tables_only>;
def g0 : Flag<["-"], "g0">, Group<gN_Group>;
def g1 : Flag<["-"], "g1">, Group<gN_Group>, Alias<gline_tables_only>;
def g2 : Flag<["-"], "g2">, Group<gN_Group>;
def g3 : Flag<["-"], "g3">, Group<gN_Group>;
def ggdb : Flag<["-"], "ggdb">, Group<gTune_Group>;
def ggdb0 : Flag<["-"], "ggdb0">, Group<ggdbN_Group>;
def ggdb1 : Flag<["-"], "ggdb1">, Group<ggdbN_Group>;
def ggdb2 : Flag<["-"], "ggdb2">, Group<ggdbN_Group>;
def ggdb3 : Flag<["-"], "ggdb3">, Group<ggdbN_Group>;
def glldb : Flag<["-"], "glldb">, Group<gTune_Group>;
def gsce : Flag<["-"], "gsce">, Group<gTune_Group>;
// Equivalent to our default dwarf version. Forces usual dwarf emission when
// CodeView is enabled.
def gdwarf : Flag<["-"], "gdwarf">, Group<g_Group>, Flags<[CoreOption]>,
HelpText<"Generate source-level debug information with the default dwarf version">;
def gdwarf_2 : Flag<["-"], "gdwarf-2">, Group<g_Group>,
HelpText<"Generate source-level debug information with dwarf version 2">;
def gdwarf_3 : Flag<["-"], "gdwarf-3">, Group<g_Group>,
HelpText<"Generate source-level debug information with dwarf version 3">;
def gdwarf_4 : Flag<["-"], "gdwarf-4">, Group<g_Group>,
HelpText<"Generate source-level debug information with dwarf version 4">;
def gdwarf_5 : Flag<["-"], "gdwarf-5">, Group<g_Group>,
HelpText<"Generate source-level debug information with dwarf version 5">;
def gcodeview : Flag<["-"], "gcodeview">,
HelpText<"Generate CodeView debug information">,
Flags<[CC1Option, CC1AsOption, CoreOption]>;
def gcodeview_ghash : Flag<["-"], "gcodeview-ghash">,
HelpText<"Emit type record hashes in a .debug$H section">,
Flags<[CC1Option, CoreOption]>;
def gno_codeview_ghash : Flag<["-"], "gno-codeview-ghash">, Flags<[CoreOption]>;
def ginline_line_tables : Flag<["-"], "ginline-line-tables">, Flags<[CoreOption]>;
def gno_inline_line_tables : Flag<["-"], "gno-inline-line-tables">,
Flags<[CC1Option, CoreOption]>, HelpText<"Don't emit inline line tables">;
def gfull : Flag<["-"], "gfull">, Group<g_Group>;
def gused : Flag<["-"], "gused">, Group<g_Group>;
def gstabs : Joined<["-"], "gstabs">, Group<g_Group>, Flags<[Unsupported]>;
def gcoff : Joined<["-"], "gcoff">, Group<g_Group>, Flags<[Unsupported]>;
def gxcoff : Joined<["-"], "gxcoff">, Group<g_Group>, Flags<[Unsupported]>;
def gvms : Joined<["-"], "gvms">, Group<g_Group>, Flags<[Unsupported]>;
def gtoggle : Flag<["-"], "gtoggle">, Group<g_flags_Group>, Flags<[Unsupported]>;
def grecord_command_line : Flag<["-"], "grecord-command-line">,
Group<g_flags_Group>;
def gno_record_command_line : Flag<["-"], "gno-record-command-line">,
Group<g_flags_Group>;
def : Flag<["-"], "grecord-gcc-switches">, Alias<grecord_command_line>;
def : Flag<["-"], "gno-record-gcc-switches">, Alias<gno_record_command_line>;
def gstrict_dwarf : Flag<["-"], "gstrict-dwarf">, Group<g_flags_Group>;
def gno_strict_dwarf : Flag<["-"], "gno-strict-dwarf">, Group<g_flags_Group>;
def gcolumn_info : Flag<["-"], "gcolumn-info">, Group<g_flags_Group>, Flags<[CoreOption]>;
def gno_column_info : Flag<["-"], "gno-column-info">, Group<g_flags_Group>, Flags<[CoreOption]>;
def gsplit_dwarf : Flag<["-"], "gsplit-dwarf">, Group<g_flags_Group>;
def gsplit_dwarf_EQ : Joined<["-"], "gsplit-dwarf=">, Group<g_flags_Group>,
HelpText<"Set DWARF fission mode to either 'split' or 'single'">,
Values<"split,single">;
def ggnu_pubnames : Flag<["-"], "ggnu-pubnames">, Group<g_flags_Group>, Flags<[CC1Option]>;
def gno_gnu_pubnames : Flag<["-"], "gno-gnu-pubnames">, Group<g_flags_Group>, Flags<[CC1Option]>;
def gpubnames : Flag<["-"], "gpubnames">, Group<g_flags_Group>, Flags<[CC1Option]>;
def gno_pubnames : Flag<["-"], "gno-pubnames">, Group<g_flags_Group>, Flags<[CC1Option]>;
def gdwarf_aranges : Flag<["-"], "gdwarf-aranges">, Group<g_flags_Group>;
def gmodules : Flag <["-"], "gmodules">, Group<gN_Group>,
HelpText<"Generate debug info with external references to clang modules"
" or precompiled headers">;
def gz : Flag<["-"], "gz">, Group<g_flags_Group>,
HelpText<"DWARF debug sections compression type">;
def gz_EQ : Joined<["-"], "gz=">, Group<g_flags_Group>,
HelpText<"DWARF debug sections compression type">;
def gembed_source : Flag<["-"], "gembed-source">, Group<g_flags_Group>, Flags<[CC1Option]>,
HelpText<"Embed source text in DWARF debug sections">;
def gno_embed_source : Flag<["-"], "gno-embed-source">, Group<g_flags_Group>,
Flags<[DriverOption]>,
HelpText<"Restore the default behavior of not embedding source text in DWARF debug sections">;
def headerpad__max__install__names : Joined<["-"], "headerpad_max_install_names">;
def help : Flag<["-", "--"], "help">, Flags<[CC1Option,CC1AsOption]>,
HelpText<"Display available options">;
def index_header_map : Flag<["-"], "index-header-map">, Flags<[CC1Option]>,
HelpText<"Make the next included directory (-I or -F) an indexer header map">;
def idirafter : JoinedOrSeparate<["-"], "idirafter">, Group<clang_i_Group>, Flags<[CC1Option]>,
HelpText<"Add directory to AFTER include search path">;
def iframework : JoinedOrSeparate<["-"], "iframework">, Group<clang_i_Group>, Flags<[CC1Option]>,
HelpText<"Add directory to SYSTEM framework search path">;
def iframeworkwithsysroot : JoinedOrSeparate<["-"], "iframeworkwithsysroot">,
Group<clang_i_Group>,
HelpText<"Add directory to SYSTEM framework search path, "
"absolute paths are relative to -isysroot">,
MetaVarName<"<directory>">, Flags<[CC1Option]>;
def imacros : JoinedOrSeparate<["-", "--"], "imacros">, Group<clang_i_Group>, Flags<[CC1Option]>,
HelpText<"Include macros from file before parsing">, MetaVarName<"<file>">;
def image__base : Separate<["-"], "image_base">;
def include_ : JoinedOrSeparate<["-", "--"], "include">, Group<clang_i_Group>, EnumName<"include">,
MetaVarName<"<file>">, HelpText<"Include file before parsing">, Flags<[CC1Option]>;
def include_pch : Separate<["-"], "include-pch">, Group<clang_i_Group>, Flags<[CC1Option]>,
HelpText<"Include precompiled header file">, MetaVarName<"<file>">;
def relocatable_pch : Flag<["-", "--"], "relocatable-pch">, Flags<[CC1Option]>,
HelpText<"Whether to build a relocatable precompiled header">;
def verify_pch : Flag<["-"], "verify-pch">, Group<Action_Group>, Flags<[CC1Option]>,
HelpText<"Load and verify that a pre-compiled header file is not stale">;
def init : Separate<["-"], "init">;
def install__name : Separate<["-"], "install_name">;
def iprefix : JoinedOrSeparate<["-"], "iprefix">, Group<clang_i_Group>, Flags<[CC1Option]>,
HelpText<"Set the -iwithprefix/-iwithprefixbefore prefix">, MetaVarName<"<dir>">;
def iquote : JoinedOrSeparate<["-"], "iquote">, Group<clang_i_Group>, Flags<[CC1Option]>,
HelpText<"Add directory to QUOTE include search path">, MetaVarName<"<directory>">;
def isysroot : JoinedOrSeparate<["-"], "isysroot">, Group<clang_i_Group>, Flags<[CC1Option]>,
HelpText<"Set the system root directory (usually /)">, MetaVarName<"<dir>">;
def isystem : JoinedOrSeparate<["-"], "isystem">, Group<clang_i_Group>,
Flags<[CC1Option]>,
HelpText<"Add directory to SYSTEM include search path">, MetaVarName<"<directory>">;
def isystem_after : JoinedOrSeparate<["-"], "isystem-after">,
Group<clang_i_Group>, Flags<[DriverOption]>, MetaVarName<"<directory>">,
HelpText<"Add directory to end of the SYSTEM include search path">;
def iwithprefixbefore : JoinedOrSeparate<["-"], "iwithprefixbefore">, Group<clang_i_Group>,
HelpText<"Set directory to include search path with prefix">, MetaVarName<"<dir>">,
Flags<[CC1Option]>;
def iwithprefix : JoinedOrSeparate<["-"], "iwithprefix">, Group<clang_i_Group>, Flags<[CC1Option]>,
HelpText<"Set directory to SYSTEM include search path with prefix">, MetaVarName<"<dir>">;
def iwithsysroot : JoinedOrSeparate<["-"], "iwithsysroot">, Group<clang_i_Group>,
HelpText<"Add directory to SYSTEM include search path, "
"absolute paths are relative to -isysroot">, MetaVarName<"<directory>">,
Flags<[CC1Option]>;
def ivfsoverlay : JoinedOrSeparate<["-"], "ivfsoverlay">, Group<clang_i_Group>, Flags<[CC1Option]>,
HelpText<"Overlay the virtual filesystem described by file over the real file system">;
def imultilib : Separate<["-"], "imultilib">, Group<gfortran_Group>;
def keep__private__externs : Flag<["-"], "keep_private_externs">;
def l : JoinedOrSeparate<["-"], "l">, Flags<[LinkerInput, RenderJoined]>,
Group<Link_Group>;
def lazy__framework : Separate<["-"], "lazy_framework">, Flags<[LinkerInput]>;
def lazy__library : Separate<["-"], "lazy_library">, Flags<[LinkerInput]>;
def mlittle_endian : Flag<["-"], "mlittle-endian">, Flags<[DriverOption]>;
def EL : Flag<["-"], "EL">, Alias<mlittle_endian>;
def mbig_endian : Flag<["-"], "mbig-endian">, Flags<[DriverOption]>;
def EB : Flag<["-"], "EB">, Alias<mbig_endian>;
def m16 : Flag<["-"], "m16">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
def m32 : Flag<["-"], "m32">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
def mqdsp6_compat : Flag<["-"], "mqdsp6-compat">, Group<m_Group>, Flags<[DriverOption,CC1Option]>,
HelpText<"Enable hexagon-qdsp6 backward compatibility">;
def m64 : Flag<["-"], "m64">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
def mx32 : Flag<["-"], "mx32">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
def mabi_EQ : Joined<["-"], "mabi=">, Group<m_Group>;
def miamcu : Flag<["-"], "miamcu">, Group<m_Group>, Flags<[DriverOption, CoreOption]>,
HelpText<"Use Intel MCU ABI">;
def mno_iamcu : Flag<["-"], "mno-iamcu">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
def malign_functions_EQ : Joined<["-"], "malign-functions=">, Group<clang_ignored_m_Group>;
def malign_loops_EQ : Joined<["-"], "malign-loops=">, Group<clang_ignored_m_Group>;
def malign_jumps_EQ : Joined<["-"], "malign-jumps=">, Group<clang_ignored_m_Group>;
def malign_branch_EQ : CommaJoined<["-"], "malign-branch=">, Group<m_Group>;
def malign_branch_boundary_EQ : Joined<["-"], "malign-branch-boundary=">, Group<m_Group>;
def malign_branch_prefix_size_EQ : Joined<["-"], "malign-branch-prefix-size=">, Group<m_Group>;
def mbranches_within_32B_boundaries : Flag<["-"], "mbranches-within-32B-boundaries">, Flags<[DriverOption]>, Group<m_Group>;
def mfancy_math_387 : Flag<["-"], "mfancy-math-387">, Group<clang_ignored_m_Group>;
def mlong_calls : Flag<["-"], "mlong-calls">, Group<m_Group>,
HelpText<"Generate branches with extended addressability, usually via indirect jumps.">;
def LongDouble_Group : OptionGroup<"<LongDouble group>">, Group<m_Group>,
DocName<"Long double flags">,
DocBrief<[{Selects the long double implementation}]>;
def mlong_double_64 : Flag<["-"], "mlong-double-64">, Group<LongDouble_Group>, Flags<[CC1Option]>,
HelpText<"Force long double to be 64 bits">;
def mlong_double_80 : Flag<["-"], "mlong-double-80">, Group<LongDouble_Group>, Flags<[CC1Option]>,
HelpText<"Force long double to be 80 bits, padded to 128 bits for storage">;
def mlong_double_128 : Flag<["-"], "mlong-double-128">, Group<LongDouble_Group>, Flags<[CC1Option]>,
HelpText<"Force long double to be 128 bits">;
def mno_long_calls : Flag<["-"], "mno-long-calls">, Group<m_Group>,
HelpText<"Restore the default behaviour of not generating long calls">;
def mexecute_only : Flag<["-"], "mexecute-only">, Group<m_arm_Features_Group>,
HelpText<"Disallow generation of data access to code sections (ARM only)">;
def mno_execute_only : Flag<["-"], "mno-execute-only">, Group<m_arm_Features_Group>,
HelpText<"Allow generation of data access to code sections (ARM only)">;
def mtp_mode_EQ : Joined<["-"], "mtp=">, Group<m_arm_Features_Group>, Values<"soft,cp15,el0,el1,el2,el3">,
HelpText<"Thread pointer access method (AArch32/AArch64 only)">;
def mpure_code : Flag<["-"], "mpure-code">, Alias<mexecute_only>; // Alias for GCC compatibility
def mno_pure_code : Flag<["-"], "mno-pure-code">, Alias<mno_execute_only>;
def mtvos_version_min_EQ : Joined<["-"], "mtvos-version-min=">, Group<m_Group>;
def mappletvos_version_min_EQ : Joined<["-"], "mappletvos-version-min=">, Alias<mtvos_version_min_EQ>;
def mtvos_simulator_version_min_EQ : Joined<["-"], "mtvos-simulator-version-min=">;
def mappletvsimulator_version_min_EQ : Joined<["-"], "mappletvsimulator-version-min=">, Alias<mtvos_simulator_version_min_EQ>;
def mwatchos_version_min_EQ : Joined<["-"], "mwatchos-version-min=">, Group<m_Group>;
def mwatchos_simulator_version_min_EQ : Joined<["-"], "mwatchos-simulator-version-min=">;
def mwatchsimulator_version_min_EQ : Joined<["-"], "mwatchsimulator-version-min=">, Alias<mwatchos_simulator_version_min_EQ>;
def march_EQ : Joined<["-"], "march=">, Group<m_Group>, Flags<[CoreOption]>;
def masm_EQ : Joined<["-"], "masm=">, Group<m_Group>, Flags<[DriverOption]>;
def mcmodel_EQ : Joined<["-"], "mcmodel=">, Group<m_Group>;
def mtls_size_EQ : Joined<["-"], "mtls-size=">, Group<m_Group>, Flags<[DriverOption, CC1Option]>,
HelpText<"Specify bit size of immediate TLS offsets (AArch64 ELF only): "
"12 (for 4KB) | 24 (for 16MB, default) | 32 (for 4GB) | 48 (for 256TB, needs -mcmodel=large)">;
def mimplicit_it_EQ : Joined<["-"], "mimplicit-it=">, Group<m_Group>;
def mdefault_build_attributes : Joined<["-"], "mdefault-build-attributes">, Group<m_Group>;
def mno_default_build_attributes : Joined<["-"], "mno-default-build-attributes">, Group<m_Group>;
def mconstant_cfstrings : Flag<["-"], "mconstant-cfstrings">, Group<clang_ignored_m_Group>;
def mconsole : Joined<["-"], "mconsole">, Group<m_Group>, Flags<[DriverOption]>;
def mwindows : Joined<["-"], "mwindows">, Group<m_Group>, Flags<[DriverOption]>;
def mdll : Joined<["-"], "mdll">, Group<m_Group>, Flags<[DriverOption]>;
def municode : Joined<["-"], "municode">, Group<m_Group>, Flags<[DriverOption]>;
def mthreads : Joined<["-"], "mthreads">, Group<m_Group>, Flags<[DriverOption]>;
def mcpu_EQ : Joined<["-"], "mcpu=">, Group<m_Group>;
def mmcu_EQ : Joined<["-"], "mmcu=">, Group<m_Group>;
def mdynamic_no_pic : Joined<["-"], "mdynamic-no-pic">, Group<m_Group>;
def mfix_and_continue : Flag<["-"], "mfix-and-continue">, Group<clang_ignored_m_Group>;
def mieee_fp : Flag<["-"], "mieee-fp">, Group<clang_ignored_m_Group>;
def minline_all_stringops : Flag<["-"], "minline-all-stringops">, Group<clang_ignored_m_Group>;
def mno_inline_all_stringops : Flag<["-"], "mno-inline-all-stringops">, Group<clang_ignored_m_Group>;
def malign_double : Flag<["-"], "malign-double">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Align doubles to two words in structs (x86 only)">;
def mfloat_abi_EQ : Joined<["-"], "mfloat-abi=">, Group<m_Group>, Values<"soft,softfp,hard">;
def mfpmath_EQ : Joined<["-"], "mfpmath=">, Group<m_Group>;
def mfpu_EQ : Joined<["-"], "mfpu=">, Group<m_Group>;
def mhwdiv_EQ : Joined<["-"], "mhwdiv=">, Group<m_Group>;
def mhwmult_EQ : Joined<["-"], "mhwmult=">, Group<m_Group>;
def mglobal_merge : Flag<["-"], "mglobal-merge">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Enable merging of globals">;
def mhard_float : Flag<["-"], "mhard-float">, Group<m_Group>;
def miphoneos_version_min_EQ : Joined<["-"], "miphoneos-version-min=">, Group<m_Group>;
def mios_version_min_EQ : Joined<["-"], "mios-version-min=">,
Alias<miphoneos_version_min_EQ>, HelpText<"Set iOS deployment target">;
def mios_simulator_version_min_EQ : Joined<["-"], "mios-simulator-version-min=">;
def miphonesimulator_version_min_EQ : Joined<["-"], "miphonesimulator-version-min=">, Alias<mios_simulator_version_min_EQ>;
def mkernel : Flag<["-"], "mkernel">, Group<m_Group>;
def mlinker_version_EQ : Joined<["-"], "mlinker-version=">,
Flags<[DriverOption]>;
def mllvm : Separate<["-"], "mllvm">, Flags<[CC1Option,CC1AsOption,CoreOption]>,
HelpText<"Additional arguments to forward to LLVM's option processing">;
def mmacosx_version_min_EQ : Joined<["-"], "mmacosx-version-min=">,
Group<m_Group>, HelpText<"Set Mac OS X deployment target">;
def mmacos_version_min_EQ : Joined<["-"], "mmacos-version-min=">,
Group<m_Group>, Alias<mmacosx_version_min_EQ>;
def mms_bitfields : Flag<["-"], "mms-bitfields">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Set the default structure layout to be compatible with the Microsoft compiler standard">;
def moutline : Flag<["-"], "moutline">, Group<f_clang_Group>, Flags<[CC1Option]>,
HelpText<"Enable function outlining (AArch64 only)">;
def mno_outline : Flag<["-"], "mno-outline">, Group<f_clang_Group>, Flags<[CC1Option]>,
HelpText<"Disable function outlining (AArch64 only)">;
def mno_ms_bitfields : Flag<["-"], "mno-ms-bitfields">, Group<m_Group>,
HelpText<"Do not set the default structure layout to be compatible with the Microsoft compiler standard">;
def mstackrealign : Flag<["-"], "mstackrealign">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Force realign the stack at entry to every function">;
def mstack_alignment : Joined<["-"], "mstack-alignment=">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Set the stack alignment">;
def mstack_probe_size : Joined<["-"], "mstack-probe-size=">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Set the stack probe size">;
def mstack_arg_probe : Flag<["-"], "mstack-arg-probe">, Group<m_Group>,
HelpText<"Enable stack probes">;
def mno_stack_arg_probe : Flag<["-"], "mno-stack-arg-probe">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Disable stack probes which are enabled by default">;
def mthread_model : Separate<["-"], "mthread-model">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"The thread model to use, e.g. posix, single (posix by default)">, Values<"posix,single">;
def meabi : Separate<["-"], "meabi">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Set EABI type, e.g. 4, 5 or gnu (default depends on triple)">, Values<"default,4,5,gnu">;
def mno_constant_cfstrings : Flag<["-"], "mno-constant-cfstrings">, Group<m_Group>;
def mno_global_merge : Flag<["-"], "mno-global-merge">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Disable merging of globals">;
def mno_pascal_strings : Flag<["-"], "mno-pascal-strings">,
Alias<fno_pascal_strings>;
def mno_red_zone : Flag<["-"], "mno-red-zone">, Group<m_Group>;
def mno_tls_direct_seg_refs : Flag<["-"], "mno-tls-direct-seg-refs">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Disable direct TLS access through segment registers">;
def mno_relax_all : Flag<["-"], "mno-relax-all">, Group<m_Group>;
def mno_rtd: Flag<["-"], "mno-rtd">, Group<m_Group>;
def mno_soft_float : Flag<["-"], "mno-soft-float">, Group<m_Group>;
def mno_stackrealign : Flag<["-"], "mno-stackrealign">, Group<m_Group>;
def mretpoline : Flag<["-"], "mretpoline">, Group<m_Group>, Flags<[CoreOption,DriverOption]>;
def mno_retpoline : Flag<["-"], "mno-retpoline">, Group<m_Group>, Flags<[CoreOption,DriverOption]>;
def mspeculative_load_hardening : Flag<["-"], "mspeculative-load-hardening">,
Group<m_Group>, Flags<[CoreOption,CC1Option]>;
def mno_speculative_load_hardening : Flag<["-"], "mno-speculative-load-hardening">,
Group<m_Group>, Flags<[CoreOption]>;
+def mlvi_hardening : Flag<["-"], "mlvi-hardening">, Group<m_Group>, Flags<[CoreOption,DriverOption]>,
+ HelpText<"Enable all mitigations for Load Value Injection (LVI)">;
+def mno_lvi_hardening : Flag<["-"], "mno-lvi-hardening">, Group<m_Group>, Flags<[CoreOption,DriverOption]>,
+ HelpText<"Disable mitigations for Load Value Injection (LVI)">;
+def mlvi_cfi : Flag<["-"], "mlvi-cfi">, Group<m_Group>, Flags<[CoreOption,DriverOption]>,
+ HelpText<"Enable only control-flow mitigations for Load Value Injection (LVI)">;
+def mno_lvi_cfi : Flag<["-"], "mno-lvi-cfi">, Group<m_Group>, Flags<[CoreOption,DriverOption]>,
+ HelpText<"Disable control-flow mitigations for Load Value Injection (LVI)">;
def mrelax : Flag<["-"], "mrelax">, Group<m_riscv_Features_Group>,
HelpText<"Enable linker relaxation">;
def mno_relax : Flag<["-"], "mno-relax">, Group<m_riscv_Features_Group>,
HelpText<"Disable linker relaxation">;
def msave_restore : Flag<["-"], "msave-restore">, Group<m_riscv_Features_Group>,
HelpText<"Enable using library calls for save and restore">;
def mno_save_restore : Flag<["-"], "mno-save-restore">, Group<m_riscv_Features_Group>,
HelpText<"Disable using library calls for save and restore">;
def mcmodel_EQ_medlow : Flag<["-"], "mcmodel=medlow">, Group<m_riscv_Features_Group>,
Flags<[CC1Option]>, Alias<mcmodel_EQ>, AliasArgs<["small"]>,
HelpText<"Equivalent to -mcmodel=small, compatible with RISC-V gcc.">;
def mcmodel_EQ_medany : Flag<["-"], "mcmodel=medany">, Group<m_riscv_Features_Group>,
Flags<[CC1Option]>, Alias<mcmodel_EQ>, AliasArgs<["medium"]>,
HelpText<"Equivalent to -mcmodel=medium, compatible with RISC-V gcc.">;
def munaligned_access : Flag<["-"], "munaligned-access">, Group<m_arm_Features_Group>,
HelpText<"Allow memory accesses to be unaligned (AArch32/AArch64 only)">;
def mno_unaligned_access : Flag<["-"], "mno-unaligned-access">, Group<m_arm_Features_Group>,
HelpText<"Force all memory accesses to be aligned (AArch32/AArch64 only)">;
def mstrict_align : Flag<["-"], "mstrict-align">, Alias<mno_unaligned_access>, Flags<[CC1Option,HelpHidden]>,
HelpText<"Force all memory accesses to be aligned (same as mno-unaligned-access)">;
def mno_thumb : Flag<["-"], "mno-thumb">, Group<m_arm_Features_Group>;
def mrestrict_it: Flag<["-"], "mrestrict-it">, Group<m_arm_Features_Group>,
HelpText<"Disallow generation of deprecated IT blocks for ARMv8. It is on by default for ARMv8 Thumb mode.">;
def mno_restrict_it: Flag<["-"], "mno-restrict-it">, Group<m_arm_Features_Group>,
HelpText<"Allow generation of deprecated IT blocks for ARMv8. It is off by default for ARMv8 Thumb mode">;
def marm : Flag<["-"], "marm">, Alias<mno_thumb>;
def ffixed_r9 : Flag<["-"], "ffixed-r9">, Group<m_arm_Features_Group>,
HelpText<"Reserve the r9 register (ARM only)">;
def mno_movt : Flag<["-"], "mno-movt">, Group<m_arm_Features_Group>,
HelpText<"Disallow use of movt/movw pairs (ARM only)">;
def mcrc : Flag<["-"], "mcrc">, Group<m_Group>,
HelpText<"Allow use of CRC instructions (ARM/Mips only)">;
def mnocrc : Flag<["-"], "mnocrc">, Group<m_arm_Features_Group>,
HelpText<"Disallow use of CRC instructions (ARM only)">;
def mno_neg_immediates: Flag<["-"], "mno-neg-immediates">, Group<m_arm_Features_Group>,
HelpText<"Disallow converting instructions with negative immediates to their negation or inversion.">;
def mcmse : Flag<["-"], "mcmse">, Group<m_arm_Features_Group>,
Flags<[DriverOption,CC1Option]>,
HelpText<"Allow use of CMSE (Armv8-M Security Extensions)">;
def mgeneral_regs_only : Flag<["-"], "mgeneral-regs-only">, Group<m_aarch64_Features_Group>,
HelpText<"Generate code which only uses the general purpose registers (AArch64 only)">;
def mfix_cortex_a53_835769 : Flag<["-"], "mfix-cortex-a53-835769">,
Group<m_aarch64_Features_Group>,
HelpText<"Workaround Cortex-A53 erratum 835769 (AArch64 only)">;
def mno_fix_cortex_a53_835769 : Flag<["-"], "mno-fix-cortex-a53-835769">,
Group<m_aarch64_Features_Group>,
HelpText<"Don't workaround Cortex-A53 erratum 835769 (AArch64 only)">;
foreach i = {1-31} in
def ffixed_x#i : Flag<["-"], "ffixed-x"#i>, Group<m_Group>,
HelpText<"Reserve the "#i#" register (AArch64/RISC-V only)">;
foreach i = {8-15,18} in
def fcall_saved_x#i : Flag<["-"], "fcall-saved-x"#i>, Group<m_aarch64_Features_Group>,
HelpText<"Make the x"#i#" register call-saved (AArch64 only)">;
def msign_return_address_EQ : Joined<["-"], "msign-return-address=">,
Flags<[CC1Option]>, Group<m_Group>, Values<"none,all,non-leaf">,
HelpText<"Select return address signing scope">;
def mbranch_protection_EQ : Joined<["-"], "mbranch-protection=">,
HelpText<"Enforce targets of indirect branches and function returns">;
def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
def munimplemented_simd128 : Flag<["-"], "munimplemented-simd128">, Group<m_wasm_Features_Group>;
def mno_unimplemented_simd128 : Flag<["-"], "mno-unimplemented-simd128">, Group<m_wasm_Features_Group>;
def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;
def mnontrapping_fptoint : Flag<["-"], "mnontrapping-fptoint">, Group<m_wasm_Features_Group>;
def mno_nontrapping_fptoint : Flag<["-"], "mno-nontrapping-fptoint">, Group<m_wasm_Features_Group>;
def msign_ext : Flag<["-"], "msign-ext">, Group<m_wasm_Features_Group>;
def mno_sign_ext : Flag<["-"], "mno-sign-ext">, Group<m_wasm_Features_Group>;
def mexception_handling : Flag<["-"], "mexception-handling">, Group<m_wasm_Features_Group>;
def mno_exception_handling : Flag<["-"], "mno-exception-handling">, Group<m_wasm_Features_Group>;
def matomics : Flag<["-"], "matomics">, Group<m_wasm_Features_Group>;
def mno_atomics : Flag<["-"], "mno-atomics">, Group<m_wasm_Features_Group>;
def mbulk_memory : Flag<["-"], "mbulk-memory">, Group<m_wasm_Features_Group>;
def mno_bulk_memory : Flag<["-"], "mno-bulk-memory">, Group<m_wasm_Features_Group>;
def mmutable_globals : Flag<["-"], "mmutable-globals">, Group<m_wasm_Features_Group>;
def mno_mutable_globals : Flag<["-"], "mno-mutable-globals">, Group<m_wasm_Features_Group>;
def mmultivalue : Flag<["-"], "mmultivalue">, Group<m_wasm_Features_Group>;
def mno_multivalue : Flag<["-"], "mno-multivalue">, Group<m_wasm_Features_Group>;
def mtail_call : Flag<["-"], "mtail-call">, Group<m_wasm_Features_Group>;
def mno_tail_call : Flag<["-"], "mno-tail-call">, Group<m_wasm_Features_Group>;
def mreference_types : Flag<["-"], "mreference-types">, Group<m_wasm_Features_Group>;
def mno_reference_types : Flag<["-"], "mno-reference-types">, Group<m_wasm_Features_Group>;
def mamdgpu_debugger_abi : Joined<["-"], "mamdgpu-debugger-abi=">,
Flags<[HelpHidden]>,
Group<m_Group>,
HelpText<"Generate additional code for specified <version> of debugger ABI (AMDGPU only)">,
MetaVarName<"<version>">;
def mcode_object_v3 : Flag<["-"], "mcode-object-v3">, Group<m_amdgpu_Features_Group>,
HelpText<"Enable code object v3 (AMDGPU only)">;
def mno_code_object_v3 : Flag<["-"], "mno-code-object-v3">, Group<m_amdgpu_Features_Group>,
HelpText<"Disable code object v3 (AMDGPU only)">;
def mxnack : Flag<["-"], "mxnack">, Group<m_amdgpu_Features_Group>,
HelpText<"Enable XNACK (AMDGPU only)">;
def mno_xnack : Flag<["-"], "mno-xnack">, Group<m_amdgpu_Features_Group>,
HelpText<"Disable XNACK (AMDGPU only)">;
def msram_ecc : Flag<["-"], "msram-ecc">, Group<m_amdgpu_Features_Group>,
HelpText<"Enable SRAM ECC (AMDGPU only)">;
def mno_sram_ecc : Flag<["-"], "mno-sram-ecc">, Group<m_amdgpu_Features_Group>,
HelpText<"Disable SRAM ECC (AMDGPU only)">;
def mcumode : Flag<["-"], "mcumode">, Group<m_amdgpu_Features_Group>,
HelpText<"Use the CU wavefront execution mode (AMDGPU only)">;
def mno_cumode : Flag<["-"], "mno-cumode">, Group<m_amdgpu_Features_Group>,
HelpText<"Use the WGP wavefront execution mode (AMDGPU only)">;
def mwavefrontsize64 : Flag<["-"], "mwavefrontsize64">,
Group<m_Group>, HelpText<"Use a wavefront size of 64">;
def mno_wavefrontsize64 : Flag<["-"], "mno-wavefrontsize64">,
Group<m_Group>, HelpText<"Use a wavefront size of 32">;
def faltivec : Flag<["-"], "faltivec">, Group<f_Group>, Flags<[DriverOption]>;
def fno_altivec : Flag<["-"], "fno-altivec">, Group<f_Group>, Flags<[DriverOption]>;
def maltivec : Flag<["-"], "maltivec">, Group<m_ppc_Features_Group>;
def mno_altivec : Flag<["-"], "mno-altivec">, Group<m_ppc_Features_Group>;
def mspe : Flag<["-"], "mspe">, Group<m_ppc_Features_Group>;
def mno_spe : Flag<["-"], "mno-spe">, Group<m_ppc_Features_Group>;
def mvsx : Flag<["-"], "mvsx">, Group<m_ppc_Features_Group>;
def mno_vsx : Flag<["-"], "mno-vsx">, Group<m_ppc_Features_Group>;
def msecure_plt : Flag<["-"], "msecure-plt">, Group<m_ppc_Features_Group>;
def mpower8_vector : Flag<["-"], "mpower8-vector">,
Group<m_ppc_Features_Group>;
def mno_power8_vector : Flag<["-"], "mno-power8-vector">,
Group<m_ppc_Features_Group>;
def mpower9_vector : Flag<["-"], "mpower9-vector">,
Group<m_ppc_Features_Group>;
def mno_power9_vector : Flag<["-"], "mno-power9-vector">,
Group<m_ppc_Features_Group>;
def mpower8_crypto : Flag<["-"], "mcrypto">,
Group<m_ppc_Features_Group>;
def mno_power8_crypto : Flag<["-"], "mno-crypto">,
Group<m_ppc_Features_Group>;
def mdirect_move : Flag<["-"], "mdirect-move">,
Group<m_ppc_Features_Group>;
def mno_direct_move : Flag<["-"], "mno-direct-move">,
Group<m_ppc_Features_Group>;
def mhtm : Flag<["-"], "mhtm">, Group<m_ppc_Features_Group>;
def mno_htm : Flag<["-"], "mno-htm">, Group<m_ppc_Features_Group>;
def mfprnd : Flag<["-"], "mfprnd">, Group<m_ppc_Features_Group>;
def mno_fprnd : Flag<["-"], "mno-fprnd">, Group<m_ppc_Features_Group>;
def mcmpb : Flag<["-"], "mcmpb">, Group<m_ppc_Features_Group>;
def mno_cmpb : Flag<["-"], "mno-cmpb">, Group<m_ppc_Features_Group>;
def misel : Flag<["-"], "misel">, Group<m_ppc_Features_Group>;
def mno_isel : Flag<["-"], "mno-isel">, Group<m_ppc_Features_Group>;
def mmfocrf : Flag<["-"], "mmfocrf">, Group<m_ppc_Features_Group>;
def mmfcrf : Flag<["-"], "mmfcrf">, Alias<mmfocrf>;
def mno_mfocrf : Flag<["-"], "mno-mfocrf">, Group<m_ppc_Features_Group>;
def mno_mfcrf : Flag<["-"], "mno-mfcrf">, Alias<mno_mfocrf>;
def mpopcntd : Flag<["-"], "mpopcntd">, Group<m_ppc_Features_Group>;
def mno_popcntd : Flag<["-"], "mno-popcntd">, Group<m_ppc_Features_Group>;
def mqpx : Flag<["-"], "mqpx">, Group<m_ppc_Features_Group>;
def mno_qpx : Flag<["-"], "mno-qpx">, Group<m_ppc_Features_Group>;
def mcrbits : Flag<["-"], "mcrbits">, Group<m_ppc_Features_Group>;
def mno_crbits : Flag<["-"], "mno-crbits">, Group<m_ppc_Features_Group>;
def minvariant_function_descriptors :
Flag<["-"], "minvariant-function-descriptors">, Group<m_ppc_Features_Group>;
def mno_invariant_function_descriptors :
Flag<["-"], "mno-invariant-function-descriptors">,
Group<m_ppc_Features_Group>;
def mfloat128: Flag<["-"], "mfloat128">,
Group<m_ppc_Features_Group>;
def mno_float128 : Flag<["-"], "mno-float128">,
Group<m_ppc_Features_Group>;
def mlongcall: Flag<["-"], "mlongcall">,
Group<m_ppc_Features_Group>;
def mno_longcall : Flag<["-"], "mno-longcall">,
Group<m_ppc_Features_Group>;
def maix_struct_return : Flag<["-"], "maix-struct-return">,
Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Return all structs in memory (PPC32 only)">;
def msvr4_struct_return : Flag<["-"], "msvr4-struct-return">,
Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Return small structs in registers (PPC32 only)">;
def mvx : Flag<["-"], "mvx">, Group<m_Group>;
def mno_vx : Flag<["-"], "mno-vx">, Group<m_Group>;
def fzvector : Flag<["-"], "fzvector">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Enable System z vector language extension">;
def fno_zvector : Flag<["-"], "fno-zvector">, Group<f_Group>,
Flags<[CC1Option]>;
def mzvector : Flag<["-"], "mzvector">, Alias<fzvector>;
def mno_zvector : Flag<["-"], "mno-zvector">, Alias<fno_zvector>;
def mbackchain : Flag<["-"], "mbackchain">, Group<m_Group>, Flags<[DriverOption,CC1Option]>,
HelpText<"Link stack frames through backchain on System Z">;
def mno_backchain : Flag<["-"], "mno-backchain">, Group<m_Group>, Flags<[DriverOption,CC1Option]>;
def mno_warn_nonportable_cfstrings : Flag<["-"], "mno-warn-nonportable-cfstrings">, Group<m_Group>;
def mno_omit_leaf_frame_pointer : Flag<["-"], "mno-omit-leaf-frame-pointer">, Group<m_Group>;
def momit_leaf_frame_pointer : Flag<["-"], "momit-leaf-frame-pointer">, Group<m_Group>,
HelpText<"Omit frame pointer setup for leaf functions">;
def moslib_EQ : Joined<["-"], "moslib=">, Group<m_Group>;
def mpascal_strings : Flag<["-"], "mpascal-strings">, Alias<fpascal_strings>;
def mred_zone : Flag<["-"], "mred-zone">, Group<m_Group>;
def mtls_direct_seg_refs : Flag<["-"], "mtls-direct-seg-refs">, Group<m_Group>,
HelpText<"Enable direct TLS access through segment registers (default)">;
def mregparm_EQ : Joined<["-"], "mregparm=">, Group<m_Group>;
def mrelax_all : Flag<["-"], "mrelax-all">, Group<m_Group>, Flags<[CC1Option,CC1AsOption]>,
HelpText<"(integrated-as) Relax all machine instructions">;
def mincremental_linker_compatible : Flag<["-"], "mincremental-linker-compatible">, Group<m_Group>,
Flags<[CC1Option,CC1AsOption]>,
HelpText<"(integrated-as) Emit an object file which can be used with an incremental linker">;
def mno_incremental_linker_compatible : Flag<["-"], "mno-incremental-linker-compatible">, Group<m_Group>,
HelpText<"(integrated-as) Emit an object file which cannot be used with an incremental linker">;
def mrtd : Flag<["-"], "mrtd">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Make StdCall calling convention the default">;
def msmall_data_threshold_EQ : Joined <["-"], "msmall-data-threshold=">,
Group<m_Group>, Alias<G>;
def msoft_float : Flag<["-"], "msoft-float">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Use software floating point">;
def mno_implicit_float : Flag<["-"], "mno-implicit-float">, Group<m_Group>,
HelpText<"Don't generate implicit floating point instructions">;
def mimplicit_float : Flag<["-"], "mimplicit-float">, Group<m_Group>;
def mrecip : Flag<["-"], "mrecip">, Group<m_Group>;
def mrecip_EQ : CommaJoined<["-"], "mrecip=">, Group<m_Group>, Flags<[CC1Option]>;
def mprefer_vector_width_EQ : Joined<["-"], "mprefer-vector-width=">, Group<m_Group>, Flags<[CC1Option]>,
HelpText<"Specifies preferred vector width for auto-vectorization. Defaults to 'none' which allows target specific decisions.">;
def mpie_copy_relocations : Flag<["-"], "mpie-copy-relocations">, Group<m_Group>,
Flags<[CC1Option]>,
HelpText<"Use copy relocations support for PIE builds">;
def mno_pie_copy_relocations : Flag<["-"], "mno-pie-copy-relocations">, Group<m_Group>;
def mfentry : Flag<["-"], "mfentry">, HelpText<"Insert calls to fentry at function entry (x86/SystemZ only)">,
Flags<[CC1Option]>, Group<m_Group>;
def mnop_mcount : Flag<["-"], "mnop-mcount">, HelpText<"Generate mcount/__fentry__ calls as nops; to activate them, they must be patched in.">,
Flags<[CC1Option]>, Group<m_Group>;
def mrecord_mcount : Flag<["-"], "mrecord-mcount">, HelpText<"Generate a __mcount_loc section entry for each __fentry__ call.">,
Flags<[CC1Option]>, Group<m_Group>;
def mpacked_stack : Flag<["-"], "mpacked-stack">, HelpText<"Use packed stack layout (SystemZ only).">,
Flags<[CC1Option]>, Group<m_Group>;
def mno_packed_stack : Flag<["-"], "mno-packed-stack">, Flags<[CC1Option]>, Group<m_Group>;
def mips16 : Flag<["-"], "mips16">, Group<m_mips_Features_Group>;
def mno_mips16 : Flag<["-"], "mno-mips16">, Group<m_mips_Features_Group>;
def mmicromips : Flag<["-"], "mmicromips">, Group<m_mips_Features_Group>;
def mno_micromips : Flag<["-"], "mno-micromips">, Group<m_mips_Features_Group>;
def mxgot : Flag<["-"], "mxgot">, Group<m_mips_Features_Group>;
def mno_xgot : Flag<["-"], "mno-xgot">, Group<m_mips_Features_Group>;
def mldc1_sdc1 : Flag<["-"], "mldc1-sdc1">, Group<m_mips_Features_Group>;
def mno_ldc1_sdc1 : Flag<["-"], "mno-ldc1-sdc1">, Group<m_mips_Features_Group>;
def mcheck_zero_division : Flag<["-"], "mcheck-zero-division">,
Group<m_mips_Features_Group>;
def mno_check_zero_division : Flag<["-"], "mno-check-zero-division">,
Group<m_mips_Features_Group>;
def mcompact_branches_EQ : Joined<["-"], "mcompact-branches=">,
Group<m_mips_Features_Group>;
def mbranch_likely : Flag<["-"], "mbranch-likely">, Group<m_Group>,
IgnoredGCCCompat;
def mno_branch_likely : Flag<["-"], "mno-branch-likely">, Group<m_Group>,
IgnoredGCCCompat;
def mindirect_jump_EQ : Joined<["-"], "mindirect-jump=">,
Group<m_mips_Features_Group>,
HelpText<"Change indirect jump instructions to inhibit speculation">;
def mdsp : Flag<["-"], "mdsp">, Group<m_mips_Features_Group>;
def mno_dsp : Flag<["-"], "mno-dsp">, Group<m_mips_Features_Group>;
def mdspr2 : Flag<["-"], "mdspr2">, Group<m_mips_Features_Group>;
def mno_dspr2 : Flag<["-"], "mno-dspr2">, Group<m_mips_Features_Group>;
def msingle_float : Flag<["-"], "msingle-float">, Group<m_mips_Features_Group>;
def mdouble_float : Flag<["-"], "mdouble-float">, Group<m_mips_Features_Group>;
def mmadd4 : Flag<["-"], "mmadd4">, Group<m_mips_Features_Group>,
HelpText<"Enable the generation of 4-operand madd.s, madd.d and related instructions.">;
def mno_madd4 : Flag<["-"], "mno-madd4">, Group<m_mips_Features_Group>,
HelpText<"Disable the generation of 4-operand madd.s, madd.d and related instructions.">;
def mmsa : Flag<["-"], "mmsa">, Group<m_mips_Features_Group>,
HelpText<"Enable MSA ASE (MIPS only)">;
def mno_msa : Flag<["-"], "mno-msa">, Group<m_mips_Features_Group>,
HelpText<"Disable MSA ASE (MIPS only)">;
def mmt : Flag<["-"], "mmt">, Group<m_mips_Features_Group>,
HelpText<"Enable MT ASE (MIPS only)">;
def mno_mt : Flag<["-"], "mno-mt">, Group<m_mips_Features_Group>,
HelpText<"Disable MT ASE (MIPS only)">;
def mfp64 : Flag<["-"], "mfp64">, Group<m_mips_Features_Group>,
HelpText<"Use 64-bit floating point registers (MIPS only)">;
def mfp32 : Flag<["-"], "mfp32">, Group<m_mips_Features_Group>,
HelpText<"Use 32-bit floating point registers (MIPS only)">;
def mgpopt : Flag<["-"], "mgpopt">, Group<m_mips_Features_Group>,
HelpText<"Use GP relative accesses for symbols known to be in a small"
" data section (MIPS)">;
def mno_gpopt : Flag<["-"], "mno-gpopt">, Group<m_mips_Features_Group>,
HelpText<"Do not use GP relative accesses for symbols known to be in a small"
" data section (MIPS)">;
def mlocal_sdata : Flag<["-"], "mlocal-sdata">,
Group<m_mips_Features_Group>,
HelpText<"Extend the -G behaviour to object local data (MIPS)">;
def mno_local_sdata : Flag<["-"], "mno-local-sdata">,
Group<m_mips_Features_Group>,
HelpText<"Do not extend the -G behaviour to object local data (MIPS)">;
def mextern_sdata : Flag<["-"], "mextern-sdata">,
Group<m_mips_Features_Group>,
HelpText<"Assume that externally defined data is in a small data section if"
" it meets the -G <size> threshold (MIPS)">;
def mno_extern_sdata : Flag<["-"], "mno-extern-sdata">,
Group<m_mips_Features_Group>,
HelpText<"Do not assume that externally defined data is in a small data"
" section if it meets the -G <size> threshold (MIPS)">;
def membedded_data : Flag<["-"], "membedded-data">,
Group<m_mips_Features_Group>,
HelpText<"Place constants in the .rodata section instead of the .sdata "
"section even if they meet the -G <size> threshold (MIPS)">;
def mno_embedded_data : Flag<["-"], "mno-embedded-data">,
Group<m_mips_Features_Group>,
HelpText<"Do not place constants in the .rodata section instead of the "
".sdata if they meet the -G <size> threshold (MIPS)">;
def mnan_EQ : Joined<["-"], "mnan=">, Group<m_mips_Features_Group>;
def mabs_EQ : Joined<["-"], "mabs=">, Group<m_mips_Features_Group>;
def mabicalls : Flag<["-"], "mabicalls">, Group<m_mips_Features_Group>,
HelpText<"Enable SVR4-style position-independent code (Mips only)">;
def mno_abicalls : Flag<["-"], "mno-abicalls">, Group<m_mips_Features_Group>,
HelpText<"Disable SVR4-style position-independent code (Mips only)">;
def mno_crc : Flag<["-"], "mno-crc">, Group<m_mips_Features_Group>,
HelpText<"Disallow use of CRC instructions (Mips only)">;
def mvirt : Flag<["-"], "mvirt">, Group<m_mips_Features_Group>;
def mno_virt : Flag<["-"], "mno-virt">, Group<m_mips_Features_Group>;
def mginv : Flag<["-"], "mginv">, Group<m_mips_Features_Group>;
def mno_ginv : Flag<["-"], "mno-ginv">, Group<m_mips_Features_Group>;
def mips1 : Flag<["-"], "mips1">,
Alias<march_EQ>, AliasArgs<["mips1"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips1">, Flags<[HelpHidden]>;
def mips2 : Flag<["-"], "mips2">,
Alias<march_EQ>, AliasArgs<["mips2"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips2">, Flags<[HelpHidden]>;
def mips3 : Flag<["-"], "mips3">,
Alias<march_EQ>, AliasArgs<["mips3"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips3">, Flags<[HelpHidden]>;
def mips4 : Flag<["-"], "mips4">,
Alias<march_EQ>, AliasArgs<["mips4"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips4">, Flags<[HelpHidden]>;
def mips5 : Flag<["-"], "mips5">,
Alias<march_EQ>, AliasArgs<["mips5"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips5">, Flags<[HelpHidden]>;
def mips32 : Flag<["-"], "mips32">,
Alias<march_EQ>, AliasArgs<["mips32"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips32">, Flags<[HelpHidden]>;
def mips32r2 : Flag<["-"], "mips32r2">,
Alias<march_EQ>, AliasArgs<["mips32r2"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips32r2">, Flags<[HelpHidden]>;
def mips32r3 : Flag<["-"], "mips32r3">,
Alias<march_EQ>, AliasArgs<["mips32r3"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips32r3">, Flags<[HelpHidden]>;
def mips32r5 : Flag<["-"], "mips32r5">,
Alias<march_EQ>, AliasArgs<["mips32r5"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips32r5">, Flags<[HelpHidden]>;
def mips32r6 : Flag<["-"], "mips32r6">,
Alias<march_EQ>, AliasArgs<["mips32r6"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips32r6">, Flags<[HelpHidden]>;
def mips64 : Flag<["-"], "mips64">,
Alias<march_EQ>, AliasArgs<["mips64"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips64">, Flags<[HelpHidden]>;
def mips64r2 : Flag<["-"], "mips64r2">,
Alias<march_EQ>, AliasArgs<["mips64r2"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips64r2">, Flags<[HelpHidden]>;
def mips64r3 : Flag<["-"], "mips64r3">,
Alias<march_EQ>, AliasArgs<["mips64r3"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips64r3">, Flags<[HelpHidden]>;
def mips64r5 : Flag<["-"], "mips64r5">,
Alias<march_EQ>, AliasArgs<["mips64r5"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips64r5">, Flags<[HelpHidden]>;
def mips64r6 : Flag<["-"], "mips64r6">,
Alias<march_EQ>, AliasArgs<["mips64r6"]>, Group<m_mips_Features_Group>,
HelpText<"Equivalent to -march=mips64r6">, Flags<[HelpHidden]>;
def mfpxx : Flag<["-"], "mfpxx">, Group<m_mips_Features_Group>,
HelpText<"Avoid FPU mode dependent operations when used with the O32 ABI">,
Flags<[HelpHidden]>;
def modd_spreg : Flag<["-"], "modd-spreg">, Group<m_mips_Features_Group>,
HelpText<"Enable odd single-precision floating point registers">,
Flags<[HelpHidden]>;
def mno_odd_spreg : Flag<["-"], "mno-odd-spreg">, Group<m_mips_Features_Group>,
HelpText<"Disable odd single-precision floating point registers">,
Flags<[HelpHidden]>;
def mrelax_pic_calls : Flag<["-"], "mrelax-pic-calls">,
Group<m_mips_Features_Group>,
HelpText<"Produce relaxation hints for linkers to try optimizing PIC "
"call sequences into direct calls (MIPS only)">, Flags<[HelpHidden]>;
def mno_relax_pic_calls : Flag<["-"], "mno-relax-pic-calls">,
Group<m_mips_Features_Group>,
HelpText<"Do not produce relaxation hints for linkers to try optimizing PIC "
"call sequences into direct calls (MIPS only)">, Flags<[HelpHidden]>;
def mglibc : Flag<["-"], "mglibc">, Group<m_libc_Group>, Flags<[HelpHidden]>;
def muclibc : Flag<["-"], "muclibc">, Group<m_libc_Group>, Flags<[HelpHidden]>;
def module_file_info : Flag<["-"], "module-file-info">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
HelpText<"Provide information about a particular module file">;
def mthumb : Flag<["-"], "mthumb">, Group<m_Group>;
def mtune_EQ : Joined<["-"], "mtune=">, Group<m_Group>;
def multi__module : Flag<["-"], "multi_module">;
def multiply__defined__unused : Separate<["-"], "multiply_defined_unused">;
def multiply__defined : Separate<["-"], "multiply_defined">;
def mwarn_nonportable_cfstrings : Flag<["-"], "mwarn-nonportable-cfstrings">, Group<m_Group>;
def no_canonical_prefixes : Flag<["-"], "no-canonical-prefixes">, Flags<[HelpHidden, CoreOption]>,
HelpText<"Use relative instead of canonical paths">;
def no_cpp_precomp : Flag<["-"], "no-cpp-precomp">, Group<clang_ignored_f_Group>;
def no_integrated_cpp : Flag<["-", "--"], "no-integrated-cpp">, Flags<[DriverOption]>;
def no_pedantic : Flag<["-", "--"], "no-pedantic">, Group<pedantic_Group>;
def no__dead__strip__inits__and__terms : Flag<["-"], "no_dead_strip_inits_and_terms">;
def nobuiltininc : Flag<["-"], "nobuiltininc">, Flags<[CC1Option, CoreOption]>,
HelpText<"Disable builtin #include directories">;
def nocudainc : Flag<["-"], "nocudainc">;
def nogpulib : Flag<["-"], "nogpulib">,
HelpText<"Do not link device library for CUDA/HIP device compilation">;
def : Flag<["-"], "nocudalib">, Alias<nogpulib>;
def nodefaultlibs : Flag<["-"], "nodefaultlibs">;
def nofixprebinding : Flag<["-"], "nofixprebinding">;
def nolibc : Flag<["-"], "nolibc">;
def nomultidefs : Flag<["-"], "nomultidefs">;
def nopie : Flag<["-"], "nopie">;
def no_pie : Flag<["-"], "no-pie">, Alias<nopie>;
def noprebind : Flag<["-"], "noprebind">;
def noprofilelib : Flag<["-"], "noprofilelib">;
def noseglinkedit : Flag<["-"], "noseglinkedit">;
def nostartfiles : Flag<["-"], "nostartfiles">;
def nostdinc : Flag<["-"], "nostdinc">, Flags<[CoreOption]>;
def nostdlibinc : Flag<["-"], "nostdlibinc">;
def nostdincxx : Flag<["-"], "nostdinc++">, Flags<[CC1Option]>,
HelpText<"Disable standard #include directories for the C++ standard library">;
def nostdlib : Flag<["-"], "nostdlib">;
def nostdlibxx : Flag<["-"], "nostdlib++">;
def object : Flag<["-"], "object">;
def o : JoinedOrSeparate<["-"], "o">, Flags<[DriverOption, RenderAsInput, CC1Option, CC1AsOption]>,
HelpText<"Write output to <file>">, MetaVarName<"<file>">;
def pagezero__size : JoinedOrSeparate<["-"], "pagezero_size">;
def pass_exit_codes : Flag<["-", "--"], "pass-exit-codes">, Flags<[Unsupported]>;
def pedantic_errors : Flag<["-", "--"], "pedantic-errors">, Group<pedantic_Group>, Flags<[CC1Option]>;
def pedantic : Flag<["-", "--"], "pedantic">, Group<pedantic_Group>, Flags<[CC1Option]>;
def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">, Flags<[CC1Option]>;
def pipe : Flag<["-", "--"], "pipe">,
HelpText<"Use pipes between commands, when possible">;
def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">;
def prebind : Flag<["-"], "prebind">;
def preload : Flag<["-"], "preload">;
def print_file_name_EQ : Joined<["-", "--"], "print-file-name=">,
HelpText<"Print the full library path of <file>">, MetaVarName<"<file>">;
def print_ivar_layout : Flag<["-"], "print-ivar-layout">, Flags<[CC1Option]>,
HelpText<"Enable Objective-C Ivar layout bitmap print trace">;
def print_libgcc_file_name : Flag<["-", "--"], "print-libgcc-file-name">,
HelpText<"Print the library path for the currently used compiler runtime "
"library (\"libgcc.a\" or \"libclang_rt.builtins.*.a\")">;
def print_multi_directory : Flag<["-", "--"], "print-multi-directory">;
def print_multi_lib : Flag<["-", "--"], "print-multi-lib">;
def print_multi_os_directory : Flag<["-", "--"], "print-multi-os-directory">,
Flags<[Unsupported]>;
def print_target_triple : Flag<["-", "--"], "print-target-triple">,
HelpText<"Print the normalized target triple">;
def print_effective_triple : Flag<["-", "--"], "print-effective-triple">,
HelpText<"Print the effective target triple">;
def print_prog_name_EQ : Joined<["-", "--"], "print-prog-name=">,
HelpText<"Print the full program path of <name>">, MetaVarName<"<name>">;
def print_resource_dir : Flag<["-", "--"], "print-resource-dir">,
HelpText<"Print the resource directory pathname">;
def print_search_dirs : Flag<["-", "--"], "print-search-dirs">,
HelpText<"Print the paths used for finding libraries and programs">;
def private__bundle : Flag<["-"], "private_bundle">;
def pthreads : Flag<["-"], "pthreads">;
def pthread : Flag<["-"], "pthread">, Flags<[CC1Option]>,
HelpText<"Support POSIX threads in generated code">;
def no_pthread : Flag<["-"], "no-pthread">, Flags<[CC1Option]>;
def p : Flag<["-"], "p">;
def pie : Flag<["-"], "pie">;
def static_pie : Flag<["-"], "static-pie">;
def read__only__relocs : Separate<["-"], "read_only_relocs">;
def remap : Flag<["-"], "remap">;
def rewrite_objc : Flag<["-"], "rewrite-objc">, Flags<[DriverOption,CC1Option]>,
HelpText<"Rewrite Objective-C source to C++">, Group<Action_Group>;
def rewrite_legacy_objc : Flag<["-"], "rewrite-legacy-objc">, Flags<[DriverOption]>,
HelpText<"Rewrite Legacy Objective-C source to C++">;
def rdynamic : Flag<["-"], "rdynamic">;
def resource_dir : Separate<["-"], "resource-dir">,
Flags<[DriverOption, CC1Option, CoreOption, HelpHidden]>,
HelpText<"The directory which holds the compiler resource files">;
def resource_dir_EQ : Joined<["-"], "resource-dir=">, Flags<[DriverOption, CoreOption]>,
Alias<resource_dir>;
def rpath : Separate<["-"], "rpath">, Flags<[LinkerInput]>, Group<Link_Group>;
def rtlib_EQ : Joined<["-", "--"], "rtlib=">,
HelpText<"Compiler runtime library to use">;
def frtlib_add_rpath: Flag<["-"], "frtlib-add-rpath">, Flags<[NoArgumentUnused]>,
HelpText<"Add -rpath with architecture-specific resource directory to the linker flags">;
def fno_rtlib_add_rpath: Flag<["-"], "fno-rtlib-add-rpath">, Flags<[NoArgumentUnused]>,
HelpText<"Do not add -rpath with architecture-specific resource directory to the linker flags">;
def r : Flag<["-"], "r">, Flags<[LinkerInput,NoArgumentUnused]>,
Group<Link_Group>;
def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[CC1Option, DriverOption]>,
HelpText<"Save intermediate compilation results.">;
def save_temps : Flag<["-", "--"], "save-temps">, Flags<[DriverOption]>,
Alias<save_temps_EQ>, AliasArgs<["cwd"]>,
HelpText<"Save intermediate compilation results">;
def save_stats_EQ : Joined<["-", "--"], "save-stats=">, Flags<[DriverOption]>,
HelpText<"Save llvm statistics.">;
def save_stats : Flag<["-", "--"], "save-stats">, Flags<[DriverOption]>,
Alias<save_stats_EQ>, AliasArgs<["cwd"]>,
HelpText<"Save llvm statistics.">;
def via_file_asm : Flag<["-", "--"], "via-file-asm">, InternalDebugOpt,
HelpText<"Write assembly to file for input to assemble jobs">;
def sectalign : MultiArg<["-"], "sectalign", 3>;
def sectcreate : MultiArg<["-"], "sectcreate", 3>;
def sectobjectsymbols : MultiArg<["-"], "sectobjectsymbols", 2>;
def sectorder : MultiArg<["-"], "sectorder", 3>;
def seg1addr : JoinedOrSeparate<["-"], "seg1addr">;
def seg__addr__table__filename : Separate<["-"], "seg_addr_table_filename">;
def seg__addr__table : Separate<["-"], "seg_addr_table">;
def segaddr : MultiArg<["-"], "segaddr", 2>;
def segcreate : MultiArg<["-"], "segcreate", 3>;
def seglinkedit : Flag<["-"], "seglinkedit">;
def segprot : MultiArg<["-"], "segprot", 3>;
def segs__read__only__addr : Separate<["-"], "segs_read_only_addr">;
def segs__read__write__addr : Separate<["-"], "segs_read_write_addr">;
def segs__read__ : Joined<["-"], "segs_read_">;
def shared_libgcc : Flag<["-"], "shared-libgcc">;
def shared : Flag<["-", "--"], "shared">;
def single__module : Flag<["-"], "single_module">;
def specs_EQ : Joined<["-", "--"], "specs=">;
def specs : Separate<["-", "--"], "specs">, Flags<[Unsupported]>;
def static_libgcc : Flag<["-"], "static-libgcc">;
def static_libstdcxx : Flag<["-"], "static-libstdc++">;
def static : Flag<["-", "--"], "static">, Flags<[NoArgumentUnused]>;
def std_default_EQ : Joined<["-"], "std-default=">;
def std_EQ : Joined<["-", "--"], "std=">, Flags<[CC1Option]>,
Group<CompileOnly_Group>, HelpText<"Language standard to compile for">,
ValuesCode<[{
const char *Values =
#define LANGSTANDARD(id, name, lang, desc, features) name ","
#define LANGSTANDARD_ALIAS(id, alias) alias ","
#include "clang/Basic/LangStandards.def"
;
}]>;
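// Illustrative note: the ValuesCode above stitches the entries of
// LangStandards.def into a single comma-separated value list, e.g.
// "c89,gnu89,...,c++17,gnu++17,...", which the driver uses to validate
// and suggest values for -std=.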
def stdlib_EQ : Joined<["-", "--"], "stdlib=">, Flags<[CC1Option]>,
HelpText<"C++ standard library to use">, Values<"libc++,libstdc++,platform">;
def stdlibxx_isystem : JoinedOrSeparate<["-"], "stdlib++-isystem">,
Group<clang_i_Group>,
HelpText<"Use directory as the C++ standard library include path">,
Flags<[DriverOption]>, MetaVarName<"<directory>">;
def unwindlib_EQ : Joined<["-", "--"], "unwindlib=">, Flags<[CC1Option]>,
HelpText<"Unwind library to use">, Values<"libgcc,unwindlib,platform">;
def sub__library : JoinedOrSeparate<["-"], "sub_library">;
def sub__umbrella : JoinedOrSeparate<["-"], "sub_umbrella">;
def system_header_prefix : Joined<["--"], "system-header-prefix=">,
Group<clang_i_Group>, Flags<[CC1Option]>, MetaVarName<"<prefix>">,
HelpText<"Treat all #include paths starting with <prefix> as including a "
"system header.">;
def : Separate<["--"], "system-header-prefix">, Alias<system_header_prefix>;
def no_system_header_prefix : Joined<["--"], "no-system-header-prefix=">,
Group<clang_i_Group>, Flags<[CC1Option]>, MetaVarName<"<prefix>">,
HelpText<"Treat all #include paths starting with <prefix> as not including a "
"system header.">;
def : Separate<["--"], "no-system-header-prefix">, Alias<no_system_header_prefix>;
def s : Flag<["-"], "s">, Group<Link_Group>;
def target : Joined<["--"], "target=">, Flags<[DriverOption, CoreOption]>,
HelpText<"Generate code for the given target">;
def print_supported_cpus : Flag<["-", "--"], "print-supported-cpus">,
Group<CompileOnly_Group>, Flags<[CC1Option, CoreOption]>,
HelpText<"Print supported cpu models for the given target (if target is not specified,"
" it will print the supported cpus for the default target)">;
def mcpu_EQ_QUESTION : Flag<["-"], "mcpu=?">, Alias<print_supported_cpus>;
def mtune_EQ_QUESTION : Flag<["-"], "mtune=?">, Alias<print_supported_cpus>;
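// For example, `clang --print-supported-cpus` and the alias spelling
// `clang -mcpu=?` both list the CPU names accepted by -mcpu= for the
// current target; add --target=<triple> to query a different one.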
def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[DriverOption]>,
HelpText<"Use the gcc toolchain at the given directory">;
def time : Flag<["-"], "time">,
HelpText<"Time individual commands">;
def traditional_cpp : Flag<["-", "--"], "traditional-cpp">, Flags<[CC1Option]>,
HelpText<"Enable some traditional CPP emulation">;
def traditional : Flag<["-", "--"], "traditional">;
def trigraphs : Flag<["-", "--"], "trigraphs">, Alias<ftrigraphs>,
HelpText<"Process trigraph sequences">;
def twolevel__namespace__hints : Flag<["-"], "twolevel_namespace_hints">;
def twolevel__namespace : Flag<["-"], "twolevel_namespace">;
def t : Flag<["-"], "t">, Group<Link_Group>;
def umbrella : Separate<["-"], "umbrella">;
def undefined : JoinedOrSeparate<["-"], "undefined">, Group<u_Group>;
def undef : Flag<["-"], "undef">, Group<u_Group>, Flags<[CC1Option]>,
HelpText<"undef all system defines">;
def unexported__symbols__list : Separate<["-"], "unexported_symbols_list">;
def u : JoinedOrSeparate<["-"], "u">, Group<u_Group>;
def v : Flag<["-"], "v">, Flags<[CC1Option, CoreOption]>,
HelpText<"Show commands to run and use verbose output">;
def verify_debug_info : Flag<["--"], "verify-debug-info">, Flags<[DriverOption]>,
HelpText<"Verify the binary representation of debug output">;
def weak_l : Joined<["-"], "weak-l">, Flags<[LinkerInput]>;
def weak__framework : Separate<["-"], "weak_framework">, Flags<[LinkerInput]>;
def weak__library : Separate<["-"], "weak_library">, Flags<[LinkerInput]>;
def weak__reference__mismatches : Separate<["-"], "weak_reference_mismatches">;
def whatsloaded : Flag<["-"], "whatsloaded">;
def whyload : Flag<["-"], "whyload">;
def w : Flag<["-"], "w">, HelpText<"Suppress all warnings">, Flags<[CC1Option]>;
def x : JoinedOrSeparate<["-"], "x">, Flags<[DriverOption,CC1Option]>,
HelpText<"Treat subsequent input files as having type <language>">,
MetaVarName<"<language>">;
def y : Joined<["-"], "y">;
def fintegrated_as : Flag<["-"], "fintegrated-as">, Flags<[DriverOption]>,
Group<f_Group>, HelpText<"Enable the integrated assembler">;
def fno_integrated_as : Flag<["-"], "fno-integrated-as">,
Flags<[CC1Option, DriverOption]>, Group<f_Group>,
HelpText<"Disable the integrated assembler">;
def fintegrated_cc1 : Flag<["-"], "fintegrated-cc1">,
Flags<[CoreOption, DriverOption]>, Group<f_Group>,
HelpText<"Run cc1 in-process">;
def fno_integrated_cc1 : Flag<["-"], "fno-integrated-cc1">,
Flags<[CoreOption, DriverOption]>, Group<f_Group>,
HelpText<"Spawn a separate process for each cc1">;
def : Flag<["-"], "integrated-as">, Alias<fintegrated_as>, Flags<[DriverOption]>;
def : Flag<["-"], "no-integrated-as">, Alias<fno_integrated_as>,
Flags<[CC1Option, DriverOption]>;
def working_directory : JoinedOrSeparate<["-"], "working-directory">, Flags<[CC1Option]>,
HelpText<"Resolve file paths relative to the specified directory">;
def working_directory_EQ : Joined<["-"], "working-directory=">, Flags<[CC1Option]>,
Alias<working_directory>;
// Double dash options, which are usually an alias for one of the previous
// options.
def _mhwdiv_EQ : Joined<["--"], "mhwdiv=">, Alias<mhwdiv_EQ>;
def _mhwdiv : Separate<["--"], "mhwdiv">, Alias<mhwdiv_EQ>;
def _CLASSPATH_EQ : Joined<["--"], "CLASSPATH=">, Alias<fclasspath_EQ>;
def _CLASSPATH : Separate<["--"], "CLASSPATH">, Alias<fclasspath_EQ>;
def _all_warnings : Flag<["--"], "all-warnings">, Alias<Wall>;
def _analyzer_no_default_checks : Flag<["--"], "analyzer-no-default-checks">, Flags<[DriverOption]>;
def _analyzer_output : JoinedOrSeparate<["--"], "analyzer-output">, Flags<[DriverOption]>,
HelpText<"Static analyzer report output format (html|plist|plist-multi-file|plist-html|sarif|text).">;
def _analyze : Flag<["--"], "analyze">, Flags<[DriverOption, CoreOption]>,
HelpText<"Run the static analyzer">;
def _assemble : Flag<["--"], "assemble">, Alias<S>;
def _assert_EQ : Joined<["--"], "assert=">, Alias<A>;
def _assert : Separate<["--"], "assert">, Alias<A>;
def _bootclasspath_EQ : Joined<["--"], "bootclasspath=">, Alias<fbootclasspath_EQ>;
def _bootclasspath : Separate<["--"], "bootclasspath">, Alias<fbootclasspath_EQ>;
def _classpath_EQ : Joined<["--"], "classpath=">, Alias<fclasspath_EQ>;
def _classpath : Separate<["--"], "classpath">, Alias<fclasspath_EQ>;
def _comments_in_macros : Flag<["--"], "comments-in-macros">, Alias<CC>;
def _comments : Flag<["--"], "comments">, Alias<C>;
def _compile : Flag<["--"], "compile">, Alias<c>;
def _constant_cfstrings : Flag<["--"], "constant-cfstrings">;
def _debug_EQ : Joined<["--"], "debug=">, Alias<g_Flag>;
def _debug : Flag<["--"], "debug">, Alias<g_Flag>;
def _define_macro_EQ : Joined<["--"], "define-macro=">, Alias<D>;
def _define_macro : Separate<["--"], "define-macro">, Alias<D>;
def _dependencies : Flag<["--"], "dependencies">, Alias<M>;
def _dyld_prefix_EQ : Joined<["--"], "dyld-prefix=">;
def _dyld_prefix : Separate<["--"], "dyld-prefix">, Alias<_dyld_prefix_EQ>;
def _encoding_EQ : Joined<["--"], "encoding=">, Alias<fencoding_EQ>;
def _encoding : Separate<["--"], "encoding">, Alias<fencoding_EQ>;
def _entry : Flag<["--"], "entry">, Alias<e>;
def _extdirs_EQ : Joined<["--"], "extdirs=">, Alias<fextdirs_EQ>;
def _extdirs : Separate<["--"], "extdirs">, Alias<fextdirs_EQ>;
def _extra_warnings : Flag<["--"], "extra-warnings">, Alias<W_Joined>;
def _for_linker_EQ : Joined<["--"], "for-linker=">, Alias<Xlinker>;
def _for_linker : Separate<["--"], "for-linker">, Alias<Xlinker>;
def _force_link_EQ : Joined<["--"], "force-link=">, Alias<u>;
def _force_link : Separate<["--"], "force-link">, Alias<u>;
def _help_hidden : Flag<["--"], "help-hidden">,
HelpText<"Display help for hidden options">;
def _imacros_EQ : Joined<["--"], "imacros=">, Alias<imacros>;
def _include_barrier : Flag<["--"], "include-barrier">, Alias<I_>;
def _include_directory_after_EQ : Joined<["--"], "include-directory-after=">, Alias<idirafter>;
def _include_directory_after : Separate<["--"], "include-directory-after">, Alias<idirafter>;
def _include_directory_EQ : Joined<["--"], "include-directory=">, Alias<I>;
def _include_directory : Separate<["--"], "include-directory">, Alias<I>;
def _include_prefix_EQ : Joined<["--"], "include-prefix=">, Alias<iprefix>;
def _include_prefix : Separate<["--"], "include-prefix">, Alias<iprefix>;
def _include_with_prefix_after_EQ : Joined<["--"], "include-with-prefix-after=">, Alias<iwithprefix>;
def _include_with_prefix_after : Separate<["--"], "include-with-prefix-after">, Alias<iwithprefix>;
def _include_with_prefix_before_EQ : Joined<["--"], "include-with-prefix-before=">, Alias<iwithprefixbefore>;
def _include_with_prefix_before : Separate<["--"], "include-with-prefix-before">, Alias<iwithprefixbefore>;
def _include_with_prefix_EQ : Joined<["--"], "include-with-prefix=">, Alias<iwithprefix>;
def _include_with_prefix : Separate<["--"], "include-with-prefix">, Alias<iwithprefix>;
def _include_EQ : Joined<["--"], "include=">, Alias<include_>;
def _language_EQ : Joined<["--"], "language=">, Alias<x>;
def _language : Separate<["--"], "language">, Alias<x>;
def _library_directory_EQ : Joined<["--"], "library-directory=">, Alias<L>;
def _library_directory : Separate<["--"], "library-directory">, Alias<L>;
def _no_line_commands : Flag<["--"], "no-line-commands">, Alias<P>;
def _no_standard_includes : Flag<["--"], "no-standard-includes">, Alias<nostdinc>;
def _no_standard_libraries : Flag<["--"], "no-standard-libraries">, Alias<nostdlib>;
def _no_undefined : Flag<["--"], "no-undefined">, Flags<[LinkerInput]>;
def _no_warnings : Flag<["--"], "no-warnings">, Alias<w>;
def _optimize_EQ : Joined<["--"], "optimize=">, Alias<O>;
def _optimize : Flag<["--"], "optimize">, Alias<O>;
def _output_class_directory_EQ : Joined<["--"], "output-class-directory=">, Alias<foutput_class_dir_EQ>;
def _output_class_directory : Separate<["--"], "output-class-directory">, Alias<foutput_class_dir_EQ>;
def _output_EQ : Joined<["--"], "output=">, Alias<o>;
def _output : Separate<["--"], "output">, Alias<o>;
def _param : Separate<["--"], "param">, Group<CompileOnly_Group>;
def _param_EQ : Joined<["--"], "param=">, Alias<_param>;
def _precompile : Flag<["--"], "precompile">, Flags<[DriverOption]>,
Group<Action_Group>, HelpText<"Only precompile the input">;
def _prefix_EQ : Joined<["--"], "prefix=">, Alias<B>;
def _prefix : Separate<["--"], "prefix">, Alias<B>;
def _preprocess : Flag<["--"], "preprocess">, Alias<E>;
def _print_diagnostic_categories : Flag<["--"], "print-diagnostic-categories">;
def _print_file_name : Separate<["--"], "print-file-name">, Alias<print_file_name_EQ>;
def _print_missing_file_dependencies : Flag<["--"], "print-missing-file-dependencies">, Alias<MG>;
def _print_prog_name : Separate<["--"], "print-prog-name">, Alias<print_prog_name_EQ>;
def _profile_blocks : Flag<["--"], "profile-blocks">, Alias<a>;
def _profile : Flag<["--"], "profile">, Alias<p>;
def _resource_EQ : Joined<["--"], "resource=">, Alias<fcompile_resource_EQ>;
def _resource : Separate<["--"], "resource">, Alias<fcompile_resource_EQ>;
def _rtlib : Separate<["--"], "rtlib">, Alias<rtlib_EQ>;
def _serialize_diags : Separate<["-", "--"], "serialize-diagnostics">, Flags<[DriverOption]>,
HelpText<"Serialize compiler diagnostics to a file">;
// We give --version different semantics from -version.
def _version : Flag<["--"], "version">, Flags<[CoreOption, CC1Option]>,
HelpText<"Print version information">;
def _signed_char : Flag<["--"], "signed-char">, Alias<fsigned_char>;
def _std : Separate<["--"], "std">, Alias<std_EQ>;
def _stdlib : Separate<["--"], "stdlib">, Alias<stdlib_EQ>;
def _sysroot_EQ : Joined<["--"], "sysroot=">;
def _sysroot : Separate<["--"], "sysroot">, Alias<_sysroot_EQ>;
def _target_help : Flag<["--"], "target-help">;
def _trace_includes : Flag<["--"], "trace-includes">, Alias<H>;
def _undefine_macro_EQ : Joined<["--"], "undefine-macro=">, Alias<U>;
def _undefine_macro : Separate<["--"], "undefine-macro">, Alias<U>;
def _unsigned_char : Flag<["--"], "unsigned-char">, Alias<funsigned_char>;
def _user_dependencies : Flag<["--"], "user-dependencies">, Alias<MM>;
def _verbose : Flag<["--"], "verbose">, Alias<v>;
def _warn__EQ : Joined<["--"], "warn-=">, Alias<W_Joined>;
def _warn_ : Joined<["--"], "warn-">, Alias<W_Joined>;
def _write_dependencies : Flag<["--"], "write-dependencies">, Alias<MD>;
def _write_user_dependencies : Flag<["--"], "write-user-dependencies">, Alias<MMD>;
def _ : Joined<["--"], "">, Flags<[Unsupported]>;
// Hexagon feature flags.
def mieee_rnd_near : Flag<["-"], "mieee-rnd-near">,
Group<m_hexagon_Features_Group>;
def mv5 : Flag<["-"], "mv5">, Group<m_hexagon_Features_Group>, Alias<mcpu_EQ>,
AliasArgs<["hexagonv5"]>;
def mv55 : Flag<["-"], "mv55">, Group<m_hexagon_Features_Group>,
Alias<mcpu_EQ>, AliasArgs<["hexagonv55"]>;
def mv60 : Flag<["-"], "mv60">, Group<m_hexagon_Features_Group>,
Alias<mcpu_EQ>, AliasArgs<["hexagonv60"]>;
def mv62 : Flag<["-"], "mv62">, Group<m_hexagon_Features_Group>,
Alias<mcpu_EQ>, AliasArgs<["hexagonv62"]>;
def mv65 : Flag<["-"], "mv65">, Group<m_hexagon_Features_Group>,
Alias<mcpu_EQ>, AliasArgs<["hexagonv65"]>;
def mv66 : Flag<["-"], "mv66">, Group<m_hexagon_Features_Group>,
Alias<mcpu_EQ>, AliasArgs<["hexagonv66"]>;
def mhexagon_hvx : Flag<["-"], "mhvx">, Group<m_hexagon_Features_HVX_Group>,
HelpText<"Enable Hexagon Vector eXtensions">;
def mhexagon_hvx_EQ : Joined<["-"], "mhvx=">,
Group<m_hexagon_Features_HVX_Group>,
HelpText<"Enable Hexagon Vector eXtensions">;
def mno_hexagon_hvx : Flag<["-"], "mno-hvx">,
Group<m_hexagon_Features_HVX_Group>,
HelpText<"Disable Hexagon Vector eXtensions">;
def mhexagon_hvx_length_EQ : Joined<["-"], "mhvx-length=">,
Group<m_hexagon_Features_HVX_Group>, HelpText<"Set Hexagon Vector Length">,
Values<"64B,128B">;
def ffixed_r19: Flag<["-"], "ffixed-r19">,
HelpText<"Reserve register r19 (Hexagon only)">;
def mmemops : Flag<["-"], "mmemops">, Group<m_hexagon_Features_Group>,
Flags<[CC1Option]>, HelpText<"Enable generation of memop instructions">;
def mno_memops : Flag<["-"], "mno-memops">, Group<m_hexagon_Features_Group>,
Flags<[CC1Option]>, HelpText<"Disable generation of memop instructions">;
def mpackets : Flag<["-"], "mpackets">, Group<m_hexagon_Features_Group>,
Flags<[CC1Option]>, HelpText<"Enable generation of instruction packets">;
def mno_packets : Flag<["-"], "mno-packets">, Group<m_hexagon_Features_Group>,
Flags<[CC1Option]>, HelpText<"Disable generation of instruction packets">;
def mnvj : Flag<["-"], "mnvj">, Group<m_hexagon_Features_Group>,
Flags<[CC1Option]>, HelpText<"Enable generation of new-value jumps">;
def mno_nvj : Flag<["-"], "mno-nvj">, Group<m_hexagon_Features_Group>,
Flags<[CC1Option]>, HelpText<"Disable generation of new-value jumps">;
def mnvs : Flag<["-"], "mnvs">, Group<m_hexagon_Features_Group>,
Flags<[CC1Option]>, HelpText<"Enable generation of new-value stores">;
def mno_nvs : Flag<["-"], "mno-nvs">, Group<m_hexagon_Features_Group>,
Flags<[CC1Option]>, HelpText<"Disable generation of new-value stores">;
// X86 feature flags
def mx87 : Flag<["-"], "mx87">, Group<m_x86_Features_Group>;
def mno_x87 : Flag<["-"], "mno-x87">, Group<m_x86_Features_Group>;
def m80387 : Flag<["-"], "m80387">, Alias<mx87>;
def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>;
def mmmx : Flag<["-"], "mmmx">, Group<m_x86_Features_Group>;
def mno_mmx : Flag<["-"], "mno-mmx">, Group<m_x86_Features_Group>;
def m3dnow : Flag<["-"], "m3dnow">, Group<m_x86_Features_Group>;
def mno_3dnow : Flag<["-"], "mno-3dnow">, Group<m_x86_Features_Group>;
def m3dnowa : Flag<["-"], "m3dnowa">, Group<m_x86_Features_Group>;
def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group<m_x86_Features_Group>;
def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
def mno_sse : Flag<["-"], "mno-sse">, Group<m_x86_Features_Group>;
def msse2 : Flag<["-"], "msse2">, Group<m_x86_Features_Group>;
def mno_sse2 : Flag<["-"], "mno-sse2">, Group<m_x86_Features_Group>;
def msse3 : Flag<["-"], "msse3">, Group<m_x86_Features_Group>;
def mno_sse3 : Flag<["-"], "mno-sse3">, Group<m_x86_Features_Group>;
def mssse3 : Flag<["-"], "mssse3">, Group<m_x86_Features_Group>;
def mno_ssse3 : Flag<["-"], "mno-ssse3">, Group<m_x86_Features_Group>;
def msse4_1 : Flag<["-"], "msse4.1">, Group<m_x86_Features_Group>;
def mno_sse4_1 : Flag<["-"], "mno-sse4.1">, Group<m_x86_Features_Group>;
def msse4_2 : Flag<["-"], "msse4.2">, Group<m_x86_Features_Group>;
def mno_sse4_2 : Flag<["-"], "mno-sse4.2">, Group<m_x86_Features_Group>;
def msse4 : Flag<["-"], "msse4">, Alias<msse4_2>;
// -mno-sse4 turns off sse4.1 which has the effect of turning off everything
// later than 4.1. -msse4 turns on 4.2 which has the effect of turning on
// everything earlier than 4.2.
def mno_sse4 : Flag<["-"], "mno-sse4">, Alias<mno_sse4_1>;
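// For example, `clang -msse4` is handled exactly like `clang -msse4.2`,
// while `clang -mno-sse4` is handled like `clang -mno-sse4.1` and so also
// strips SSE4.2.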
def msse4a : Flag<["-"], "msse4a">, Group<m_x86_Features_Group>;
def mno_sse4a : Flag<["-"], "mno-sse4a">, Group<m_x86_Features_Group>;
def mavx : Flag<["-"], "mavx">, Group<m_x86_Features_Group>;
def mno_avx : Flag<["-"], "mno-avx">, Group<m_x86_Features_Group>;
def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;
def mno_avx512f : Flag<["-"], "mno-avx512f">, Group<m_x86_Features_Group>;
def mavx512bf16 : Flag<["-"], "mavx512bf16">, Group<m_x86_Features_Group>;
def mno_avx512bf16 : Flag<["-"], "mno-avx512bf16">, Group<m_x86_Features_Group>;
def mavx512bitalg : Flag<["-"], "mavx512bitalg">, Group<m_x86_Features_Group>;
def mno_avx512bitalg : Flag<["-"], "mno-avx512bitalg">, Group<m_x86_Features_Group>;
def mavx512bw : Flag<["-"], "mavx512bw">, Group<m_x86_Features_Group>;
def mno_avx512bw : Flag<["-"], "mno-avx512bw">, Group<m_x86_Features_Group>;
def mavx512cd : Flag<["-"], "mavx512cd">, Group<m_x86_Features_Group>;
def mno_avx512cd : Flag<["-"], "mno-avx512cd">, Group<m_x86_Features_Group>;
def mavx512dq : Flag<["-"], "mavx512dq">, Group<m_x86_Features_Group>;
def mno_avx512dq : Flag<["-"], "mno-avx512dq">, Group<m_x86_Features_Group>;
def mavx512er : Flag<["-"], "mavx512er">, Group<m_x86_Features_Group>;
def mno_avx512er : Flag<["-"], "mno-avx512er">, Group<m_x86_Features_Group>;
def mavx512ifma : Flag<["-"], "mavx512ifma">, Group<m_x86_Features_Group>;
def mno_avx512ifma : Flag<["-"], "mno-avx512ifma">, Group<m_x86_Features_Group>;
def mavx512pf : Flag<["-"], "mavx512pf">, Group<m_x86_Features_Group>;
def mno_avx512pf : Flag<["-"], "mno-avx512pf">, Group<m_x86_Features_Group>;
def mavx512vbmi : Flag<["-"], "mavx512vbmi">, Group<m_x86_Features_Group>;
def mno_avx512vbmi : Flag<["-"], "mno-avx512vbmi">, Group<m_x86_Features_Group>;
def mavx512vbmi2 : Flag<["-"], "mavx512vbmi2">, Group<m_x86_Features_Group>;
def mno_avx512vbmi2 : Flag<["-"], "mno-avx512vbmi2">, Group<m_x86_Features_Group>;
def mavx512vl : Flag<["-"], "mavx512vl">, Group<m_x86_Features_Group>;
def mno_avx512vl : Flag<["-"], "mno-avx512vl">, Group<m_x86_Features_Group>;
def mavx512vnni : Flag<["-"], "mavx512vnni">, Group<m_x86_Features_Group>;
def mno_avx512vnni : Flag<["-"], "mno-avx512vnni">, Group<m_x86_Features_Group>;
def mavx512vpopcntdq : Flag<["-"], "mavx512vpopcntdq">, Group<m_x86_Features_Group>;
def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>;
def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>;
def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>;
def maes : Flag<["-"], "maes">, Group<m_x86_Features_Group>;
def mno_aes : Flag<["-"], "mno-aes">, Group<m_x86_Features_Group>;
def mbmi : Flag<["-"], "mbmi">, Group<m_x86_Features_Group>;
def mno_bmi : Flag<["-"], "mno-bmi">, Group<m_x86_Features_Group>;
def mbmi2 : Flag<["-"], "mbmi2">, Group<m_x86_Features_Group>;
def mno_bmi2 : Flag<["-"], "mno-bmi2">, Group<m_x86_Features_Group>;
def mcldemote : Flag<["-"], "mcldemote">, Group<m_x86_Features_Group>;
def mno_cldemote : Flag<["-"], "mno-cldemote">, Group<m_x86_Features_Group>;
def mclflushopt : Flag<["-"], "mclflushopt">, Group<m_x86_Features_Group>;
def mno_clflushopt : Flag<["-"], "mno-clflushopt">, Group<m_x86_Features_Group>;
def mclwb : Flag<["-"], "mclwb">, Group<m_x86_Features_Group>;
def mno_clwb : Flag<["-"], "mno-clwb">, Group<m_x86_Features_Group>;
def mwbnoinvd : Flag<["-"], "mwbnoinvd">, Group<m_x86_Features_Group>;
def mno_wbnoinvd : Flag<["-"], "mno-wbnoinvd">, Group<m_x86_Features_Group>;
def mclzero : Flag<["-"], "mclzero">, Group<m_x86_Features_Group>;
def mno_clzero : Flag<["-"], "mno-clzero">, Group<m_x86_Features_Group>;
def mcx16 : Flag<["-"], "mcx16">, Group<m_x86_Features_Group>;
def mno_cx16 : Flag<["-"], "mno-cx16">, Group<m_x86_Features_Group>;
def menqcmd : Flag<["-"], "menqcmd">, Group<m_x86_Features_Group>;
def mno_enqcmd : Flag<["-"], "mno-enqcmd">, Group<m_x86_Features_Group>;
def mf16c : Flag<["-"], "mf16c">, Group<m_x86_Features_Group>;
def mno_f16c : Flag<["-"], "mno-f16c">, Group<m_x86_Features_Group>;
def mfma : Flag<["-"], "mfma">, Group<m_x86_Features_Group>;
def mno_fma : Flag<["-"], "mno-fma">, Group<m_x86_Features_Group>;
def mfma4 : Flag<["-"], "mfma4">, Group<m_x86_Features_Group>;
def mno_fma4 : Flag<["-"], "mno-fma4">, Group<m_x86_Features_Group>;
def mfsgsbase : Flag<["-"], "mfsgsbase">, Group<m_x86_Features_Group>;
def mno_fsgsbase : Flag<["-"], "mno-fsgsbase">, Group<m_x86_Features_Group>;
def mfxsr : Flag<["-"], "mfxsr">, Group<m_x86_Features_Group>;
def mno_fxsr : Flag<["-"], "mno-fxsr">, Group<m_x86_Features_Group>;
def minvpcid : Flag<["-"], "minvpcid">, Group<m_x86_Features_Group>;
def mno_invpcid : Flag<["-"], "mno-invpcid">, Group<m_x86_Features_Group>;
def mgfni : Flag<["-"], "mgfni">, Group<m_x86_Features_Group>;
def mno_gfni : Flag<["-"], "mno-gfni">, Group<m_x86_Features_Group>;
def mlwp : Flag<["-"], "mlwp">, Group<m_x86_Features_Group>;
def mno_lwp : Flag<["-"], "mno-lwp">, Group<m_x86_Features_Group>;
def mlzcnt : Flag<["-"], "mlzcnt">, Group<m_x86_Features_Group>;
def mno_lzcnt : Flag<["-"], "mno-lzcnt">, Group<m_x86_Features_Group>;
def mmovbe : Flag<["-"], "mmovbe">, Group<m_x86_Features_Group>;
def mno_movbe : Flag<["-"], "mno-movbe">, Group<m_x86_Features_Group>;
def mmovdiri : Flag<["-"], "mmovdiri">, Group<m_x86_Features_Group>;
def mno_movdiri : Flag<["-"], "mno-movdiri">, Group<m_x86_Features_Group>;
def mmovdir64b : Flag<["-"], "mmovdir64b">, Group<m_x86_Features_Group>;
def mno_movdir64b : Flag<["-"], "mno-movdir64b">, Group<m_x86_Features_Group>;
def mmwaitx : Flag<["-"], "mmwaitx">, Group<m_x86_Features_Group>;
def mno_mwaitx : Flag<["-"], "mno-mwaitx">, Group<m_x86_Features_Group>;
def mpku : Flag<["-"], "mpku">, Group<m_x86_Features_Group>;
def mno_pku : Flag<["-"], "mno-pku">, Group<m_x86_Features_Group>;
def mpclmul : Flag<["-"], "mpclmul">, Group<m_x86_Features_Group>;
def mno_pclmul : Flag<["-"], "mno-pclmul">, Group<m_x86_Features_Group>;
def mpconfig : Flag<["-"], "mpconfig">, Group<m_x86_Features_Group>;
def mno_pconfig : Flag<["-"], "mno-pconfig">, Group<m_x86_Features_Group>;
def mpopcnt : Flag<["-"], "mpopcnt">, Group<m_x86_Features_Group>;
def mno_popcnt : Flag<["-"], "mno-popcnt">, Group<m_x86_Features_Group>;
def mprefetchwt1 : Flag<["-"], "mprefetchwt1">, Group<m_x86_Features_Group>;
def mno_prefetchwt1 : Flag<["-"], "mno-prefetchwt1">, Group<m_x86_Features_Group>;
def mprfchw : Flag<["-"], "mprfchw">, Group<m_x86_Features_Group>;
def mno_prfchw : Flag<["-"], "mno-prfchw">, Group<m_x86_Features_Group>;
def mptwrite : Flag<["-"], "mptwrite">, Group<m_x86_Features_Group>;
def mno_ptwrite : Flag<["-"], "mno-ptwrite">, Group<m_x86_Features_Group>;
def mrdpid : Flag<["-"], "mrdpid">, Group<m_x86_Features_Group>;
def mno_rdpid : Flag<["-"], "mno-rdpid">, Group<m_x86_Features_Group>;
def mrdrnd : Flag<["-"], "mrdrnd">, Group<m_x86_Features_Group>;
def mno_rdrnd : Flag<["-"], "mno-rdrnd">, Group<m_x86_Features_Group>;
def mrtm : Flag<["-"], "mrtm">, Group<m_x86_Features_Group>;
def mno_rtm : Flag<["-"], "mno-rtm">, Group<m_x86_Features_Group>;
def mrdseed : Flag<["-"], "mrdseed">, Group<m_x86_Features_Group>;
def mno_rdseed : Flag<["-"], "mno-rdseed">, Group<m_x86_Features_Group>;
def msahf : Flag<["-"], "msahf">, Group<m_x86_Features_Group>;
def mno_sahf : Flag<["-"], "mno-sahf">, Group<m_x86_Features_Group>;
def msgx : Flag<["-"], "msgx">, Group<m_x86_Features_Group>;
def mno_sgx : Flag<["-"], "mno-sgx">, Group<m_x86_Features_Group>;
def msha : Flag<["-"], "msha">, Group<m_x86_Features_Group>;
def mno_sha : Flag<["-"], "mno-sha">, Group<m_x86_Features_Group>;
def mtbm : Flag<["-"], "mtbm">, Group<m_x86_Features_Group>;
def mno_tbm : Flag<["-"], "mno-tbm">, Group<m_x86_Features_Group>;
def mvaes : Flag<["-"], "mvaes">, Group<m_x86_Features_Group>;
def mno_vaes : Flag<["-"], "mno-vaes">, Group<m_x86_Features_Group>;
def mvpclmulqdq : Flag<["-"], "mvpclmulqdq">, Group<m_x86_Features_Group>;
def mno_vpclmulqdq : Flag<["-"], "mno-vpclmulqdq">, Group<m_x86_Features_Group>;
def mwaitpkg : Flag<["-"], "mwaitpkg">, Group<m_x86_Features_Group>;
def mno_waitpkg : Flag<["-"], "mno-waitpkg">, Group<m_x86_Features_Group>;
def mxop : Flag<["-"], "mxop">, Group<m_x86_Features_Group>;
def mno_xop : Flag<["-"], "mno-xop">, Group<m_x86_Features_Group>;
def mxsave : Flag<["-"], "mxsave">, Group<m_x86_Features_Group>;
def mno_xsave : Flag<["-"], "mno-xsave">, Group<m_x86_Features_Group>;
def mxsavec : Flag<["-"], "mxsavec">, Group<m_x86_Features_Group>;
def mno_xsavec : Flag<["-"], "mno-xsavec">, Group<m_x86_Features_Group>;
def mxsaveopt : Flag<["-"], "mxsaveopt">, Group<m_x86_Features_Group>;
def mno_xsaveopt : Flag<["-"], "mno-xsaveopt">, Group<m_x86_Features_Group>;
def mxsaves : Flag<["-"], "mxsaves">, Group<m_x86_Features_Group>;
def mno_xsaves : Flag<["-"], "mno-xsaves">, Group<m_x86_Features_Group>;
def mshstk : Flag<["-"], "mshstk">, Group<m_x86_Features_Group>;
def mno_shstk : Flag<["-"], "mno-shstk">, Group<m_x86_Features_Group>;
def mretpoline_external_thunk : Flag<["-"], "mretpoline-external-thunk">, Group<m_x86_Features_Group>;
def mno_retpoline_external_thunk : Flag<["-"], "mno-retpoline-external-thunk">, Group<m_x86_Features_Group>;
def mvzeroupper : Flag<["-"], "mvzeroupper">, Group<m_x86_Features_Group>;
def mno_vzeroupper : Flag<["-"], "mno-vzeroupper">, Group<m_x86_Features_Group>;
// These are legacy user-facing driver-level option spellings. They are always
// aliases for options that are spelled using the more common Unix / GNU flag
// style of double-dash and equals-joined flags.
def gcc_toolchain_legacy_spelling : Separate<["-"], "gcc-toolchain">, Alias<gcc_toolchain>;
def target_legacy_spelling : Separate<["-"], "target">, Alias<target>;
// Special internal option to handle -Xlinker --no-demangle.
def Z_Xlinker__no_demangle : Flag<["-"], "Z-Xlinker-no-demangle">,
Flags<[Unsupported, NoArgumentUnused]>;
// Special internal option to allow forwarding arbitrary arguments to the linker.
def Zlinker_input : Separate<["-"], "Zlinker-input">,
Flags<[Unsupported, NoArgumentUnused]>;
// Reserved library options.
def Z_reserved_lib_stdcxx : Flag<["-"], "Z-reserved-lib-stdc++">,
Flags<[LinkerInput, NoArgumentUnused, Unsupported]>, Group<reserved_lib_Group>;
def Z_reserved_lib_cckext : Flag<["-"], "Z-reserved-lib-cckext">,
Flags<[LinkerInput, NoArgumentUnused, Unsupported]>, Group<reserved_lib_Group>;
// Ignored options
// FIXME: multiclasses produce suffixes, not prefixes. This is fine for now
// since it is only used in ignored options.
multiclass BooleanFFlag<string name> {
def _f : Flag<["-"], "f"#name>;
def _fno : Flag<["-"], "fno-"#name>;
}
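// For example, `defm align_labels : BooleanFFlag<"align-labels">` (used
// below) creates the records align_labels_f and align_labels_fno, accepting
// the flags -falign-labels and -fno-align-labels respectively.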
defm : BooleanFFlag<"keep-inline-functions">, Group<clang_ignored_gcc_optimization_f_Group>;
def fprofile_dir : Joined<["-"], "fprofile-dir=">, Group<f_Group>;
def fuse_ld_EQ : Joined<["-"], "fuse-ld=">, Group<f_Group>, Flags<[CoreOption]>;
defm align_labels : BooleanFFlag<"align-labels">, Group<clang_ignored_gcc_optimization_f_Group>;
def falign_labels_EQ : Joined<["-"], "falign-labels=">, Group<clang_ignored_gcc_optimization_f_Group>;
defm align_loops : BooleanFFlag<"align-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
def falign_loops_EQ : Joined<["-"], "falign-loops=">, Group<clang_ignored_gcc_optimization_f_Group>;
defm align_jumps : BooleanFFlag<"align-jumps">, Group<clang_ignored_gcc_optimization_f_Group>;
def falign_jumps_EQ : Joined<["-"], "falign-jumps=">, Group<clang_ignored_gcc_optimization_f_Group>;
// FIXME: This option should be supported and wired up to our diagnostics, but
// ignore it for now to avoid breaking builds that use it.
def fdiagnostics_show_location_EQ : Joined<["-"], "fdiagnostics-show-location=">, Group<clang_ignored_f_Group>;
defm fcheck_new : BooleanFFlag<"check-new">, Group<clang_ignored_f_Group>;
defm caller_saves : BooleanFFlag<"caller-saves">, Group<clang_ignored_gcc_optimization_f_Group>;
defm reorder_blocks : BooleanFFlag<"reorder-blocks">, Group<clang_ignored_gcc_optimization_f_Group>;
defm eliminate_unused_debug_types : BooleanFFlag<"eliminate-unused-debug-types">, Group<clang_ignored_f_Group>;
defm branch_count_reg : BooleanFFlag<"branch-count-reg">, Group<clang_ignored_gcc_optimization_f_Group>;
defm default_inline : BooleanFFlag<"default-inline">, Group<clang_ignored_gcc_optimization_f_Group>;
defm fat_lto_objects : BooleanFFlag<"fat-lto-objects">, Group<clang_ignored_gcc_optimization_f_Group>;
defm float_store : BooleanFFlag<"float-store">, Group<clang_ignored_gcc_optimization_f_Group>;
defm friend_injection : BooleanFFlag<"friend-injection">, Group<clang_ignored_f_Group>;
defm function_attribute_list : BooleanFFlag<"function-attribute-list">, Group<clang_ignored_f_Group>;
defm gcse : BooleanFFlag<"gcse">, Group<clang_ignored_gcc_optimization_f_Group>;
defm gcse_after_reload: BooleanFFlag<"gcse-after-reload">, Group<clang_ignored_gcc_optimization_f_Group>;
defm gcse_las: BooleanFFlag<"gcse-las">, Group<clang_ignored_gcc_optimization_f_Group>;
defm gcse_sm: BooleanFFlag<"gcse-sm">, Group<clang_ignored_gcc_optimization_f_Group>;
defm gnu : BooleanFFlag<"gnu">, Group<clang_ignored_f_Group>;
defm implicit_templates : BooleanFFlag<"implicit-templates">, Group<clang_ignored_f_Group>;
defm implement_inlines : BooleanFFlag<"implement-inlines">, Group<clang_ignored_f_Group>;
defm merge_constants : BooleanFFlag<"merge-constants">, Group<clang_ignored_gcc_optimization_f_Group>;
defm modulo_sched : BooleanFFlag<"modulo-sched">, Group<clang_ignored_gcc_optimization_f_Group>;
defm modulo_sched_allow_regmoves : BooleanFFlag<"modulo-sched-allow-regmoves">,
Group<clang_ignored_gcc_optimization_f_Group>;
defm inline_functions_called_once : BooleanFFlag<"inline-functions-called-once">,
Group<clang_ignored_gcc_optimization_f_Group>;
def finline_limit_EQ : Joined<["-"], "finline-limit=">, Group<clang_ignored_gcc_optimization_f_Group>;
defm finline_limit : BooleanFFlag<"inline-limit">, Group<clang_ignored_gcc_optimization_f_Group>;
defm inline_small_functions : BooleanFFlag<"inline-small-functions">,
Group<clang_ignored_gcc_optimization_f_Group>;
defm ipa_cp : BooleanFFlag<"ipa-cp">,
Group<clang_ignored_gcc_optimization_f_Group>;
defm ivopts : BooleanFFlag<"ivopts">, Group<clang_ignored_gcc_optimization_f_Group>;
def : Flag<["-"], "fno-semantic-interposition">, Group<clang_ignored_f_Group>;
defm non_call_exceptions : BooleanFFlag<"non-call-exceptions">, Group<clang_ignored_f_Group>;
defm peel_loops : BooleanFFlag<"peel-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
defm permissive : BooleanFFlag<"permissive">, Group<clang_ignored_f_Group>;
defm prefetch_loop_arrays : BooleanFFlag<"prefetch-loop-arrays">, Group<clang_ignored_gcc_optimization_f_Group>;
defm printf : BooleanFFlag<"printf">, Group<clang_ignored_f_Group>;
defm profile : BooleanFFlag<"profile">, Group<clang_ignored_f_Group>;
defm profile_correction : BooleanFFlag<"profile-correction">, Group<clang_ignored_gcc_optimization_f_Group>;
defm profile_generate_sampling : BooleanFFlag<"profile-generate-sampling">, Group<clang_ignored_f_Group>;
defm profile_reusedist : BooleanFFlag<"profile-reusedist">, Group<clang_ignored_f_Group>;
defm profile_values : BooleanFFlag<"profile-values">, Group<clang_ignored_gcc_optimization_f_Group>;
defm regs_graph : BooleanFFlag<"regs-graph">, Group<clang_ignored_f_Group>;
defm rename_registers : BooleanFFlag<"rename-registers">, Group<clang_ignored_gcc_optimization_f_Group>;
defm ripa : BooleanFFlag<"ripa">, Group<clang_ignored_f_Group>;
defm schedule_insns : BooleanFFlag<"schedule-insns">, Group<clang_ignored_gcc_optimization_f_Group>;
defm schedule_insns2 : BooleanFFlag<"schedule-insns2">, Group<clang_ignored_gcc_optimization_f_Group>;
defm see : BooleanFFlag<"see">, Group<clang_ignored_f_Group>;
defm signaling_nans : BooleanFFlag<"signaling-nans">, Group<clang_ignored_gcc_optimization_f_Group>;
defm single_precision_constant : BooleanFFlag<"single-precision-constant">,
Group<clang_ignored_gcc_optimization_f_Group>;
defm spec_constr_count : BooleanFFlag<"spec-constr-count">, Group<clang_ignored_f_Group>;
defm stack_check : BooleanFFlag<"stack-check">, Group<clang_ignored_f_Group>;
defm strength_reduce :
BooleanFFlag<"strength-reduce">, Group<clang_ignored_gcc_optimization_f_Group>;
defm tls_model : BooleanFFlag<"tls-model">, Group<clang_ignored_f_Group>;
defm tracer : BooleanFFlag<"tracer">, Group<clang_ignored_gcc_optimization_f_Group>;
defm tree_dce : BooleanFFlag<"tree-dce">, Group<clang_ignored_gcc_optimization_f_Group>;
defm tree_loop_im : BooleanFFlag<"tree_loop_im">, Group<clang_ignored_gcc_optimization_f_Group>;
defm tree_loop_ivcanon : BooleanFFlag<"tree_loop_ivcanon">, Group<clang_ignored_gcc_optimization_f_Group>;
defm tree_loop_linear : BooleanFFlag<"tree_loop_linear">, Group<clang_ignored_gcc_optimization_f_Group>;
defm tree_salias : BooleanFFlag<"tree-salias">, Group<clang_ignored_f_Group>;
defm tree_ter : BooleanFFlag<"tree-ter">, Group<clang_ignored_gcc_optimization_f_Group>;
defm tree_vectorizer_verbose : BooleanFFlag<"tree-vectorizer-verbose">, Group<clang_ignored_f_Group>;
defm tree_vrp : BooleanFFlag<"tree-vrp">, Group<clang_ignored_gcc_optimization_f_Group>;
defm unroll_all_loops : BooleanFFlag<"unroll-all-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
defm unsafe_loop_optimizations : BooleanFFlag<"unsafe-loop-optimizations">,
Group<clang_ignored_gcc_optimization_f_Group>;
defm unswitch_loops : BooleanFFlag<"unswitch-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
defm use_linker_plugin : BooleanFFlag<"use-linker-plugin">, Group<clang_ignored_gcc_optimization_f_Group>;
defm vect_cost_model : BooleanFFlag<"vect-cost-model">, Group<clang_ignored_gcc_optimization_f_Group>;
defm variable_expansion_in_unroller : BooleanFFlag<"variable-expansion-in-unroller">,
Group<clang_ignored_gcc_optimization_f_Group>;
defm web : BooleanFFlag<"web">, Group<clang_ignored_gcc_optimization_f_Group>;
defm whole_program : BooleanFFlag<"whole-program">, Group<clang_ignored_gcc_optimization_f_Group>;
defm devirtualize : BooleanFFlag<"devirtualize">, Group<clang_ignored_gcc_optimization_f_Group>;
defm devirtualize_speculatively : BooleanFFlag<"devirtualize-speculatively">,
Group<clang_ignored_gcc_optimization_f_Group>;
// Generic gfortran options.
def A_DASH : Joined<["-"], "A-">, Group<gfortran_Group>;
def J : JoinedOrSeparate<["-"], "J">, Flags<[RenderJoined]>, Group<gfortran_Group>;
def cpp : Flag<["-"], "cpp">, Group<gfortran_Group>;
def nocpp : Flag<["-"], "nocpp">, Group<gfortran_Group>;
def static_libgfortran : Flag<["-"], "static-libgfortran">, Group<gfortran_Group>;
// "f" options with values for gfortran.
def fblas_matmul_limit_EQ : Joined<["-"], "fblas-matmul-limit=">, Group<gfortran_Group>;
def fcheck_EQ : Joined<["-"], "fcheck=">, Group<gfortran_Group>;
def fcoarray_EQ : Joined<["-"], "fcoarray=">, Group<gfortran_Group>;
def fconvert_EQ : Joined<["-"], "fconvert=">, Group<gfortran_Group>;
def ffixed_line_length_VALUE : Joined<["-"], "ffixed-line-length-">, Group<gfortran_Group>;
def ffpe_trap_EQ : Joined<["-"], "ffpe-trap=">, Group<gfortran_Group>;
def ffree_line_length_VALUE : Joined<["-"], "ffree-line-length-">, Group<gfortran_Group>;
def finit_character_EQ : Joined<["-"], "finit-character=">, Group<gfortran_Group>;
def finit_integer_EQ : Joined<["-"], "finit-integer=">, Group<gfortran_Group>;
def finit_logical_EQ : Joined<["-"], "finit-logical=">, Group<gfortran_Group>;
def finit_real_EQ : Joined<["-"], "finit-real=">, Group<gfortran_Group>;
def fmax_array_constructor_EQ : Joined<["-"], "fmax-array-constructor=">, Group<gfortran_Group>;
def fmax_errors_EQ : Joined<["-"], "fmax-errors=">, Group<gfortran_Group>;
def fmax_stack_var_size_EQ : Joined<["-"], "fmax-stack-var-size=">, Group<gfortran_Group>;
def fmax_subrecord_length_EQ : Joined<["-"], "fmax-subrecord-length=">, Group<gfortran_Group>;
def frecord_marker_EQ : Joined<["-"], "frecord-marker=">, Group<gfortran_Group>;
// "f" flags for gfortran.
defm aggressive_function_elimination : BooleanFFlag<"aggressive-function-elimination">, Group<gfortran_Group>;
defm align_commons : BooleanFFlag<"align-commons">, Group<gfortran_Group>;
defm all_intrinsics : BooleanFFlag<"all-intrinsics">, Group<gfortran_Group>;
defm automatic : BooleanFFlag<"automatic">, Group<gfortran_Group>;
defm backslash : BooleanFFlag<"backslash">, Group<gfortran_Group>;
defm backtrace : BooleanFFlag<"backtrace">, Group<gfortran_Group>;
defm bounds_check : BooleanFFlag<"bounds-check">, Group<gfortran_Group>;
defm check_array_temporaries : BooleanFFlag<"check-array-temporaries">, Group<gfortran_Group>;
defm cray_pointer : BooleanFFlag<"cray-pointer">, Group<gfortran_Group>;
defm d_lines_as_code : BooleanFFlag<"d-lines-as-code">, Group<gfortran_Group>;
defm d_lines_as_comments : BooleanFFlag<"d-lines-as-comments">, Group<gfortran_Group>;
defm default_double_8 : BooleanFFlag<"default-double-8">, Group<gfortran_Group>;
defm default_integer_8 : BooleanFFlag<"default-integer-8">, Group<gfortran_Group>;
defm default_real_8 : BooleanFFlag<"default-real-8">, Group<gfortran_Group>;
defm dollar_ok : BooleanFFlag<"dollar-ok">, Group<gfortran_Group>;
defm dump_fortran_optimized : BooleanFFlag<"dump-fortran-optimized">, Group<gfortran_Group>;
defm dump_fortran_original : BooleanFFlag<"dump-fortran-original">, Group<gfortran_Group>;
defm dump_parse_tree : BooleanFFlag<"dump-parse-tree">, Group<gfortran_Group>;
defm external_blas : BooleanFFlag<"external-blas">, Group<gfortran_Group>;
defm f2c : BooleanFFlag<"f2c">, Group<gfortran_Group>;
defm fixed_form : BooleanFFlag<"fixed-form">, Group<gfortran_Group>;
defm free_form : BooleanFFlag<"free-form">, Group<gfortran_Group>;
defm frontend_optimize : BooleanFFlag<"frontend-optimize">, Group<gfortran_Group>;
defm implicit_none : BooleanFFlag<"implicit-none">, Group<gfortran_Group>;
defm init_local_zero : BooleanFFlag<"init-local-zero">, Group<gfortran_Group>;
defm integer_4_integer_8 : BooleanFFlag<"integer-4-integer-8">, Group<gfortran_Group>;
defm intrinsic_modules_path : BooleanFFlag<"intrinsic-modules-path">, Group<gfortran_Group>;
defm max_identifier_length : BooleanFFlag<"max-identifier-length">, Group<gfortran_Group>;
defm module_private : BooleanFFlag<"module-private">, Group<gfortran_Group>;
defm pack_derived : BooleanFFlag<"pack-derived">, Group<gfortran_Group>;
defm protect_parens : BooleanFFlag<"protect-parens">, Group<gfortran_Group>;
defm range_check : BooleanFFlag<"range-check">, Group<gfortran_Group>;
defm real_4_real_10 : BooleanFFlag<"real-4-real-10">, Group<gfortran_Group>;
defm real_4_real_16 : BooleanFFlag<"real-4-real-16">, Group<gfortran_Group>;
defm real_4_real_8 : BooleanFFlag<"real-4-real-8">, Group<gfortran_Group>;
defm real_8_real_10 : BooleanFFlag<"real-8-real-10">, Group<gfortran_Group>;
defm real_8_real_16 : BooleanFFlag<"real-8-real-16">, Group<gfortran_Group>;
defm real_8_real_4 : BooleanFFlag<"real-8-real-4">, Group<gfortran_Group>;
defm realloc_lhs : BooleanFFlag<"realloc-lhs">, Group<gfortran_Group>;
defm recursive : BooleanFFlag<"recursive">, Group<gfortran_Group>;
defm repack_arrays : BooleanFFlag<"repack-arrays">, Group<gfortran_Group>;
defm second_underscore : BooleanFFlag<"second-underscore">, Group<gfortran_Group>;
defm sign_zero : BooleanFFlag<"sign-zero">, Group<gfortran_Group>;
defm stack_arrays : BooleanFFlag<"stack-arrays">, Group<gfortran_Group>;
defm underscoring : BooleanFFlag<"underscoring">, Group<gfortran_Group>;
defm whole_file : BooleanFFlag<"whole-file">, Group<gfortran_Group>;
include "CC1Options.td"
include "CLCompatOptions.td"
Index: head/contrib/llvm-project/clang/lib/Basic/Targets/PPC.h
===================================================================
--- head/contrib/llvm-project/clang/lib/Basic/Targets/PPC.h (revision 362608)
+++ head/contrib/llvm-project/clang/lib/Basic/Targets/PPC.h (revision 362609)
@@ -1,475 +1,476 @@
//===--- PPC.h - Declare PPC target feature support -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares PPC TargetInfo objects.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_PPC_H
#define LLVM_CLANG_LIB_BASIC_TARGETS_PPC_H
#include "OSTargets.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/TargetOptions.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Compiler.h"
namespace clang {
namespace targets {
// PPC abstract base class
class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
/// Flags for architecture specific defines.
typedef enum {
ArchDefineNone = 0,
ArchDefineName = 1 << 0, // <name> is substituted for arch name.
ArchDefinePpcgr = 1 << 1,
ArchDefinePpcsq = 1 << 2,
ArchDefine440 = 1 << 3,
ArchDefine603 = 1 << 4,
ArchDefine604 = 1 << 5,
ArchDefinePwr4 = 1 << 6,
ArchDefinePwr5 = 1 << 7,
ArchDefinePwr5x = 1 << 8,
ArchDefinePwr6 = 1 << 9,
ArchDefinePwr6x = 1 << 10,
ArchDefinePwr7 = 1 << 11,
ArchDefinePwr8 = 1 << 12,
ArchDefinePwr9 = 1 << 13,
ArchDefineFuture = 1 << 14,
ArchDefineA2 = 1 << 15,
ArchDefineA2q = 1 << 16,
ArchDefineE500 = 1 << 17
} ArchDefineTypes;
ArchDefineTypes ArchDefs = ArchDefineNone;
static const Builtin::Info BuiltinInfo[];
static const char *const GCCRegNames[];
static const TargetInfo::GCCRegAlias GCCRegAliases[];
std::string CPU;
enum PPCFloatABI { HardFloat, SoftFloat } FloatABI;
// Target cpu features.
bool HasAltivec = false;
bool HasVSX = false;
bool HasP8Vector = false;
bool HasP8Crypto = false;
bool HasDirectMove = false;
bool HasQPX = false;
bool HasHTM = false;
bool HasBPERMD = false;
bool HasExtDiv = false;
bool HasP9Vector = false;
bool HasSPE = false;
protected:
std::string ABI;
public:
PPCTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
: TargetInfo(Triple) {
SuitableAlign = 128;
SimdDefaultAlign = 128;
LongDoubleWidth = LongDoubleAlign = 128;
LongDoubleFormat = &llvm::APFloat::PPCDoubleDouble();
}
// Set the language option for altivec based on our value.
void adjust(LangOptions &Opts) override;
// Note: GCC recognizes the following additional cpus:
// 401, 403, 405, 405fp, 440fp, 464, 464fp, 476, 476fp, 505, 740, 801,
// 821, 823, 8540, e300c2, e300c3, e500mc64, e6500, 860, cell, titan, rs64.
bool isValidCPUName(StringRef Name) const override;
void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;
bool setCPU(const std::string &Name) override {
bool CPUKnown = isValidCPUName(Name);
if (CPUKnown) {
CPU = Name;
// CPU identification.
ArchDefs =
(ArchDefineTypes)llvm::StringSwitch<int>(CPU)
.Case("440", ArchDefineName)
.Case("450", ArchDefineName | ArchDefine440)
.Case("601", ArchDefineName)
.Case("602", ArchDefineName | ArchDefinePpcgr)
.Case("603", ArchDefineName | ArchDefinePpcgr)
.Case("603e", ArchDefineName | ArchDefine603 | ArchDefinePpcgr)
.Case("603ev", ArchDefineName | ArchDefine603 | ArchDefinePpcgr)
.Case("604", ArchDefineName | ArchDefinePpcgr)
.Case("604e", ArchDefineName | ArchDefine604 | ArchDefinePpcgr)
.Case("620", ArchDefineName | ArchDefinePpcgr)
.Case("630", ArchDefineName | ArchDefinePpcgr)
.Case("7400", ArchDefineName | ArchDefinePpcgr)
.Case("7450", ArchDefineName | ArchDefinePpcgr)
.Case("750", ArchDefineName | ArchDefinePpcgr)
.Case("970", ArchDefineName | ArchDefinePwr4 | ArchDefinePpcgr |
ArchDefinePpcsq)
.Case("a2", ArchDefineA2)
.Case("a2q", ArchDefineName | ArchDefineA2 | ArchDefineA2q)
.Cases("power3", "pwr3", ArchDefinePpcgr)
.Cases("power4", "pwr4",
ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
.Cases("power5", "pwr5",
ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
ArchDefinePpcsq)
.Cases("power5x", "pwr5x",
ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
ArchDefinePpcgr | ArchDefinePpcsq)
.Cases("power6", "pwr6",
ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
.Cases("power6x", "pwr6x",
ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x |
ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
ArchDefinePpcsq)
.Cases("power7", "pwr7",
ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
ArchDefinePpcsq)
// powerpc64le automatically defaults to at least power8.
.Cases("power8", "pwr8", "ppc64le",
ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 |
ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
ArchDefinePpcgr | ArchDefinePpcsq)
.Cases("power9", "pwr9",
ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 |
ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
.Case("future",
ArchDefineFuture | ArchDefinePwr9 | ArchDefinePwr8 |
ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
ArchDefinePpcsq)
.Cases("8548", "e500", ArchDefineE500)
.Default(ArchDefineNone);
}
return CPUKnown;
}
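// Illustrative: setCPU("power8") leaves ArchDefs with ArchDefinePwr8 and all
// of the older Pwr* bits set, which getTargetDefines() later turns into the
// corresponding _ARCH_PWR* predefined macros.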
StringRef getABI() const override { return ABI; }
ArrayRef<Builtin::Info> getTargetBuiltins() const override;
bool isCLZForZeroUndef() const override { return false; }
void getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const override;
bool
initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
StringRef CPU,
const std::vector<std::string> &FeaturesVec) const override;
void addFutureSpecificFeatures(llvm::StringMap<bool> &Features) const;
bool handleTargetFeatures(std::vector<std::string> &Features,
DiagnosticsEngine &Diags) override;
bool hasFeature(StringRef Feature) const override;
void setFeatureEnabled(llvm::StringMap<bool> &Features, StringRef Name,
bool Enabled) const override;
ArrayRef<const char *> getGCCRegNames() const override;
ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;
ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override;
bool validateAsmConstraint(const char *&Name,
TargetInfo::ConstraintInfo &Info) const override {
switch (*Name) {
default:
return false;
case 'O': // Zero
break;
case 'f': // Floating point register
// Don't use floating point registers on soft float ABI.
if (FloatABI == SoftFloat)
return false;
LLVM_FALLTHROUGH;
case 'b': // Base register
Info.setAllowsRegister();
break;
// FIXME: The following are added to allow parsing.
// I just took a guess at what the actions should be.
// Also, is more specific checking needed? I.e. specific registers?
case 'd': // Floating point register (containing 64-bit value)
case 'v': // Altivec vector register
// Don't use floating point and altivec vector registers
// on soft float ABI
if (FloatABI == SoftFloat)
return false;
Info.setAllowsRegister();
break;
case 'w':
switch (Name[1]) {
case 'd': // VSX vector register to hold vector double data
case 'f': // VSX vector register to hold vector float data
case 's': // VSX vector register to hold scalar double data
case 'w': // VSX vector register to hold scalar double data
case 'a': // Any VSX register
case 'c': // An individual CR bit
case 'i': // FP or VSX register to hold 64-bit integer data
break;
default:
return false;
}
Info.setAllowsRegister();
Name++; // Skip over 'w'.
break;
case 'h': // `MQ', `CTR', or `LINK' register
case 'q': // `MQ' register
case 'c': // `CTR' register
case 'l': // `LINK' register
case 'x': // `CR' register (condition register) number 0
case 'y': // `CR' register (condition register)
case 'z': // `XER[CA]' carry bit (part of the XER register)
Info.setAllowsRegister();
break;
case 'I': // Signed 16-bit constant
case 'J': // Unsigned 16-bit constant shifted left 16 bits
// (use `L' instead for SImode constants)
case 'K': // Unsigned 16-bit constant
case 'L': // Signed 16-bit constant shifted left 16 bits
case 'M': // Constant larger than 31
case 'N': // Exact power of 2
case 'P': // Constant whose negation is a signed 16-bit constant
case 'G': // Floating point constant that can be loaded into a
// register with one instruction per word
case 'H': // Integer/Floating point constant that can be loaded
// into a register using three instructions
break;
case 'm': // Memory operand. Note that on PowerPC targets, m can
// include addresses that update the base register. It
// is therefore only safe to use `m' in an asm statement
// if that asm statement accesses the operand exactly once.
// The asm statement must also use `%U<opno>' as a
// placeholder for the "update" flag in the corresponding
// load or store instruction. For example:
// asm ("st%U0 %1,%0" : "=m" (mem) : "r" (val));
// is correct but:
// asm ("st %1,%0" : "=m" (mem) : "r" (val));
// is not. Use es rather than m if you don't want the base
// register to be updated.
case 'e':
if (Name[1] != 's')
return false;
// es: A "stable" memory operand; that is, one which does not
// include any automodification of the base register. Unlike
// `m', this constraint can be used in asm statements that
// might access the operand several times, or that might not
// access it at all.
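// For example (illustrative):
// asm ("st %1,%0" : "=es" (mem) : "r" (val));
// needs no `%U0' placeholder and remains correct even if the asm
// statement accesses the operand several times.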
Info.setAllowsMemory();
Name++; // Skip over 'e'.
break;
case 'Q': // Memory operand that is an offset from a register (it is
// usually better to use `m' or `es' in asm statements)
+ Info.setAllowsRegister();
+ LLVM_FALLTHROUGH;
case 'Z': // Memory operand that is an indexed or indirect from a
// register (it is usually better to use `m' or `es' in
// asm statements)
Info.setAllowsMemory();
- Info.setAllowsRegister();
break;
case 'R': // AIX TOC entry
case 'a': // Address operand that is an indexed or indirect from a
// register (`p' is preferable for asm statements)
case 'S': // Constant suitable as a 64-bit mask operand
case 'T': // Constant suitable as a 32-bit mask operand
case 'U': // System V Release 4 small data area reference
case 't': // AND masks that can be performed by two rldic{l, r}
// instructions
case 'W': // Vector constant that does not require memory
case 'j': // Vector constant that is all zeros.
break;
// End FIXME.
}
return true;
}
std::string convertConstraint(const char *&Constraint) const override {
std::string R;
switch (*Constraint) {
case 'e':
case 'w':
// Two-character constraint; add "^" hint for later parsing.
R = std::string("^") + std::string(Constraint, 2);
Constraint++;
break;
default:
return TargetInfo::convertConstraint(Constraint);
}
return R;
}
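// For example, the VSX constraint "wd" reaches later consumers as "^wd",
// so it cannot be confused with the single-character 'w' constraint.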
const char *getClobbers() const override { return ""; }
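// EH return data registers: on PPC the exception object and selector are
// passed in GPRs r3 and r4, hence the 0 -> 3 and 1 -> 4 mapping below.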
int getEHDataRegisterNumber(unsigned RegNo) const override {
if (RegNo == 0)
return 3;
if (RegNo == 1)
return 4;
return -1;
}
bool hasSjLjLowering() const override { return true; }
const char *getLongDoubleMangling() const override {
if (LongDoubleWidth == 64)
return "e";
return LongDoubleFormat == &llvm::APFloat::PPCDoubleDouble()
? "g"
: "u9__ieee128";
}
const char *getFloat128Mangling() const override { return "u9__ieee128"; }
};
class LLVM_LIBRARY_VISIBILITY PPC32TargetInfo : public PPCTargetInfo {
public:
PPC32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
: PPCTargetInfo(Triple, Opts) {
resetDataLayout("E-m:e-p:32:32-i64:64-n32");
switch (getTriple().getOS()) {
case llvm::Triple::Linux:
case llvm::Triple::FreeBSD:
case llvm::Triple::NetBSD:
SizeType = UnsignedInt;
PtrDiffType = SignedInt;
IntPtrType = SignedInt;
break;
case llvm::Triple::AIX:
SizeType = UnsignedLong;
PtrDiffType = SignedLong;
IntPtrType = SignedLong;
SuitableAlign = 64;
break;
default:
break;
}
if (Triple.isOSFreeBSD() || Triple.isOSNetBSD() || Triple.isOSOpenBSD() ||
Triple.getOS() == llvm::Triple::AIX || Triple.isMusl()) {
LongDoubleWidth = LongDoubleAlign = 64;
LongDoubleFormat = &llvm::APFloat::IEEEdouble();
}
// PPC32 supports atomics up to 4 bytes.
MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32;
}
BuiltinVaListKind getBuiltinVaListKind() const override {
// This is the ELF definition, and is overridden by the Darwin sub-target
return TargetInfo::PowerABIBuiltinVaList;
}
};
// Note: ABI differences may eventually require us to have a separate
// TargetInfo for little endian.
class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
public:
PPC64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
: PPCTargetInfo(Triple, Opts) {
LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
IntMaxType = SignedLong;
Int64Type = SignedLong;
if (Triple.getArch() == llvm::Triple::ppc64le) {
resetDataLayout("e-m:e-i64:64-n32:64");
ABI = "elfv2";
} else {
resetDataLayout("E-m:e-i64:64-n32:64");
ABI = "elfv1";
}
if (Triple.getOS() == llvm::Triple::AIX)
SuitableAlign = 64;
if (Triple.isOSFreeBSD() || Triple.getOS() == llvm::Triple::AIX ||
Triple.isMusl()) {
LongDoubleWidth = LongDoubleAlign = 64;
LongDoubleFormat = &llvm::APFloat::IEEEdouble();
}
// PPC64 supports atomics up to 8 bytes.
MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
}
BuiltinVaListKind getBuiltinVaListKind() const override {
return TargetInfo::CharPtrBuiltinVaList;
}
// PPC64 Linux-specific ABI options.
bool setABI(const std::string &Name) override {
if (Name == "elfv1" || Name == "elfv1-qpx" || Name == "elfv2") {
ABI = Name;
return true;
}
return false;
}
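// Illustrative driver invocation reaching this override (example command
// line): clang --target=powerpc64le-linux-gnu -mabi=elfv2 ...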
CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
switch (CC) {
case CC_Swift:
return CCCR_OK;
default:
return CCCR_Warning;
}
}
};
class LLVM_LIBRARY_VISIBILITY DarwinPPC32TargetInfo
: public DarwinTargetInfo<PPC32TargetInfo> {
public:
DarwinPPC32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
: DarwinTargetInfo<PPC32TargetInfo>(Triple, Opts) {
HasAlignMac68kSupport = true;
BoolWidth = BoolAlign = 32; // XXX support -mone-byte-bool?
PtrDiffType = SignedInt; // for http://llvm.org/bugs/show_bug.cgi?id=15726
LongLongAlign = 32;
resetDataLayout("E-m:o-p:32:32-f64:32:64-n32");
}
BuiltinVaListKind getBuiltinVaListKind() const override {
return TargetInfo::CharPtrBuiltinVaList;
}
};
class LLVM_LIBRARY_VISIBILITY DarwinPPC64TargetInfo
: public DarwinTargetInfo<PPC64TargetInfo> {
public:
DarwinPPC64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
: DarwinTargetInfo<PPC64TargetInfo>(Triple, Opts) {
HasAlignMac68kSupport = true;
resetDataLayout("E-m:o-i64:64-n32:64");
}
};
class LLVM_LIBRARY_VISIBILITY AIXPPC32TargetInfo :
public AIXTargetInfo<PPC32TargetInfo> {
public:
using AIXTargetInfo::AIXTargetInfo;
BuiltinVaListKind getBuiltinVaListKind() const override {
return TargetInfo::CharPtrBuiltinVaList;
}
};
class LLVM_LIBRARY_VISIBILITY AIXPPC64TargetInfo :
public AIXTargetInfo<PPC64TargetInfo> {
public:
using AIXTargetInfo::AIXTargetInfo;
};
} // namespace targets
} // namespace clang
#endif // LLVM_CLANG_LIB_BASIC_TARGETS_PPC_H
Index: head/contrib/llvm-project/clang/lib/Driver/SanitizerArgs.cpp
===================================================================
--- head/contrib/llvm-project/clang/lib/Driver/SanitizerArgs.cpp (revision 362608)
+++ head/contrib/llvm-project/clang/lib/Driver/SanitizerArgs.cpp (revision 362609)
@@ -1,1166 +1,1165 @@
//===--- SanitizerArgs.cpp - Arguments for sanitizer tools ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Driver/SanitizerArgs.h"
#include "ToolChains/CommonArgs.h"
#include "clang/Basic/Sanitizers.h"
#include "clang/Driver/Driver.h"
#include "clang/Driver/DriverDiagnostic.h"
#include "clang/Driver/Options.h"
#include "clang/Driver/ToolChain.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SpecialCaseList.h"
#include "llvm/Support/TargetParser.h"
#include <memory>
using namespace clang;
using namespace clang::driver;
using namespace llvm::opt;
static const SanitizerMask NeedsUbsanRt =
SanitizerKind::Undefined | SanitizerKind::Integer |
SanitizerKind::ImplicitConversion | SanitizerKind::Nullability |
SanitizerKind::CFI | SanitizerKind::FloatDivideByZero;
static const SanitizerMask NeedsUbsanCxxRt =
SanitizerKind::Vptr | SanitizerKind::CFI;
static const SanitizerMask NotAllowedWithTrap = SanitizerKind::Vptr;
static const SanitizerMask NotAllowedWithMinimalRuntime =
SanitizerKind::Function | SanitizerKind::Vptr;
static const SanitizerMask RequiresPIE =
SanitizerKind::DataFlow | SanitizerKind::HWAddress | SanitizerKind::Scudo;
static const SanitizerMask NeedsUnwindTables =
SanitizerKind::Address | SanitizerKind::HWAddress | SanitizerKind::Thread |
SanitizerKind::Memory | SanitizerKind::DataFlow;
static const SanitizerMask SupportsCoverage =
SanitizerKind::Address | SanitizerKind::HWAddress |
SanitizerKind::KernelAddress | SanitizerKind::KernelHWAddress |
SanitizerKind::MemTag | SanitizerKind::Memory |
SanitizerKind::KernelMemory | SanitizerKind::Leak |
SanitizerKind::Undefined | SanitizerKind::Integer |
SanitizerKind::ImplicitConversion | SanitizerKind::Nullability |
SanitizerKind::DataFlow | SanitizerKind::Fuzzer |
SanitizerKind::FuzzerNoLink | SanitizerKind::FloatDivideByZero |
SanitizerKind::SafeStack | SanitizerKind::ShadowCallStack;
static const SanitizerMask RecoverableByDefault =
SanitizerKind::Undefined | SanitizerKind::Integer |
SanitizerKind::ImplicitConversion | SanitizerKind::Nullability |
SanitizerKind::FloatDivideByZero;
static const SanitizerMask Unrecoverable =
SanitizerKind::Unreachable | SanitizerKind::Return;
static const SanitizerMask AlwaysRecoverable =
SanitizerKind::KernelAddress | SanitizerKind::KernelHWAddress;
static const SanitizerMask LegacyFsanitizeRecoverMask =
SanitizerKind::Undefined | SanitizerKind::Integer;
static const SanitizerMask NeedsLTO = SanitizerKind::CFI;
static const SanitizerMask TrappingSupported =
(SanitizerKind::Undefined & ~SanitizerKind::Vptr) |
SanitizerKind::UnsignedIntegerOverflow | SanitizerKind::ImplicitConversion |
SanitizerKind::Nullability | SanitizerKind::LocalBounds |
SanitizerKind::CFI | SanitizerKind::FloatDivideByZero;
static const SanitizerMask TrappingDefault = SanitizerKind::CFI;
static const SanitizerMask CFIClasses =
SanitizerKind::CFIVCall | SanitizerKind::CFINVCall |
SanitizerKind::CFIMFCall | SanitizerKind::CFIDerivedCast |
SanitizerKind::CFIUnrelatedCast;
static const SanitizerMask CompatibleWithMinimalRuntime =
TrappingSupported | SanitizerKind::Scudo | SanitizerKind::ShadowCallStack;
enum CoverageFeature {
CoverageFunc = 1 << 0,
CoverageBB = 1 << 1,
CoverageEdge = 1 << 2,
CoverageIndirCall = 1 << 3,
CoverageTraceBB = 1 << 4, // Deprecated.
CoverageTraceCmp = 1 << 5,
CoverageTraceDiv = 1 << 6,
CoverageTraceGep = 1 << 7,
Coverage8bitCounters = 1 << 8, // Deprecated.
CoverageTracePC = 1 << 9,
CoverageTracePCGuard = 1 << 10,
CoverageNoPrune = 1 << 11,
CoverageInline8bitCounters = 1 << 12,
CoveragePCTable = 1 << 13,
CoverageStackDepth = 1 << 14,
};
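// These are bit flags, so features accumulate with '|'; e.g.
// (illustrative) -fsanitize-coverage=edge,trace-pc-guard yields
// CoverageEdge | CoverageTracePCGuard.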
/// Parse a -fsanitize= or -fno-sanitize= argument's values, diagnosing any
/// invalid components. Returns a SanitizerMask.
static SanitizerMask parseArgValues(const Driver &D, const llvm::opt::Arg *A,
bool DiagnoseErrors);
/// Parse -f(no-)?sanitize-coverage= flag values, diagnosing any invalid
/// components. Returns OR of members of \c CoverageFeature enumeration.
static int parseCoverageFeatures(const Driver &D, const llvm::opt::Arg *A);
/// Produce an argument string from ArgList \p Args, which shows how it
/// provides some sanitizer kind from \p Mask. For example, the argument list
/// "-fsanitize=thread,vptr -fsanitize=address" with mask \c NeedsUbsanRt
/// would produce "-fsanitize=vptr".
static std::string lastArgumentForMask(const Driver &D,
const llvm::opt::ArgList &Args,
SanitizerMask Mask);
/// Produce an argument string from argument \p A, which shows how it provides
/// a value in \p Mask. For instance, the argument
/// "-fsanitize=address,alignment" with mask \c NeedsUbsanRt would produce
/// "-fsanitize=alignment".
static std::string describeSanitizeArg(const llvm::opt::Arg *A,
SanitizerMask Mask);
/// Produce a string containing comma-separated names of sanitizers in \p
/// Sanitizers set.
static std::string toString(const clang::SanitizerSet &Sanitizers);
static void addDefaultBlacklists(const Driver &D, SanitizerMask Kinds,
std::vector<std::string> &BlacklistFiles) {
struct Blacklist {
const char *File;
SanitizerMask Mask;
} Blacklists[] = {{"asan_blacklist.txt", SanitizerKind::Address},
{"hwasan_blacklist.txt", SanitizerKind::HWAddress},
{"memtag_blacklist.txt", SanitizerKind::MemTag},
{"msan_blacklist.txt", SanitizerKind::Memory},
{"tsan_blacklist.txt", SanitizerKind::Thread},
{"dfsan_abilist.txt", SanitizerKind::DataFlow},
{"cfi_blacklist.txt", SanitizerKind::CFI},
{"ubsan_blacklist.txt",
SanitizerKind::Undefined | SanitizerKind::Integer |
SanitizerKind::Nullability |
SanitizerKind::FloatDivideByZero}};
for (auto BL : Blacklists) {
if (!(Kinds & BL.Mask))
continue;
clang::SmallString<64> Path(D.ResourceDir);
llvm::sys::path::append(Path, "share", BL.File);
if (D.getVFS().exists(Path))
BlacklistFiles.push_back(Path.str());
else if (BL.Mask == SanitizerKind::CFI)
// If cfi_blacklist.txt cannot be found in the resource dir, driver
// should fail.
D.Diag(clang::diag::err_drv_no_such_file) << Path;
}
}
/// Sets group bits for every group that has at least one representative already
/// enabled in \p Kinds.
static SanitizerMask setGroupBits(SanitizerMask Kinds) {
#define SANITIZER(NAME, ID)
#define SANITIZER_GROUP(NAME, ID, ALIAS) \
if (Kinds & SanitizerKind::ID) \
Kinds |= SanitizerKind::ID##Group;
#include "clang/Basic/Sanitizers.def"
return Kinds;
}
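// Illustrative expansion: if Kinds contains SanitizerKind::Alignment, a
// member of the "undefined" group, the result also has
// SanitizerKind::UndefinedGroup set, so group-level masks match below.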
static SanitizerMask parseSanitizeTrapArgs(const Driver &D,
const llvm::opt::ArgList &Args) {
SanitizerMask TrapRemove; // During the loop below, the accumulated set of
// sanitizers disabled by the current sanitizer
// argument or any argument after it.
SanitizerMask TrappingKinds;
SanitizerMask TrappingSupportedWithGroups = setGroupBits(TrappingSupported);
for (ArgList::const_reverse_iterator I = Args.rbegin(), E = Args.rend();
I != E; ++I) {
const auto *Arg = *I;
if (Arg->getOption().matches(options::OPT_fsanitize_trap_EQ)) {
Arg->claim();
SanitizerMask Add = parseArgValues(D, Arg, true);
Add &= ~TrapRemove;
if (SanitizerMask InvalidValues = Add & ~TrappingSupportedWithGroups) {
SanitizerSet S;
S.Mask = InvalidValues;
D.Diag(diag::err_drv_unsupported_option_argument) << "-fsanitize-trap"
<< toString(S);
}
TrappingKinds |= expandSanitizerGroups(Add) & ~TrapRemove;
} else if (Arg->getOption().matches(options::OPT_fno_sanitize_trap_EQ)) {
Arg->claim();
TrapRemove |= expandSanitizerGroups(parseArgValues(D, Arg, true));
} else if (Arg->getOption().matches(
options::OPT_fsanitize_undefined_trap_on_error)) {
Arg->claim();
TrappingKinds |=
expandSanitizerGroups(SanitizerKind::UndefinedGroup & ~TrapRemove) &
~TrapRemove;
} else if (Arg->getOption().matches(
options::OPT_fno_sanitize_undefined_trap_on_error)) {
Arg->claim();
TrapRemove |= expandSanitizerGroups(SanitizerKind::UndefinedGroup);
}
}
// Apply default trapping behavior.
TrappingKinds |= TrappingDefault & ~TrapRemove;
return TrappingKinds;
}
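// Illustrative resolution (example command line): the loop walks the
// arguments in reverse so later flags win; "-fsanitize-trap=undefined
// -fno-sanitize-trap=alignment" therefore traps on the "undefined"
// checks except alignment.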
bool SanitizerArgs::needsUbsanRt() const {
// All of these include ubsan.
if (needsAsanRt() || needsMsanRt() || needsHwasanRt() || needsTsanRt() ||
needsDfsanRt() || needsLsanRt() || needsCfiDiagRt() ||
(needsScudoRt() && !requiresMinimalRuntime()))
return false;
return (Sanitizers.Mask & NeedsUbsanRt & ~TrapSanitizers.Mask) ||
CoverageFeatures;
}
bool SanitizerArgs::needsCfiRt() const {
return !(Sanitizers.Mask & SanitizerKind::CFI & ~TrapSanitizers.Mask) &&
CfiCrossDso && !ImplicitCfiRuntime;
}
bool SanitizerArgs::needsCfiDiagRt() const {
return (Sanitizers.Mask & SanitizerKind::CFI & ~TrapSanitizers.Mask) &&
CfiCrossDso && !ImplicitCfiRuntime;
}
bool SanitizerArgs::requiresPIE() const {
return NeedPIE || (Sanitizers.Mask & RequiresPIE);
}
bool SanitizerArgs::needsUnwindTables() const {
return static_cast<bool>(Sanitizers.Mask & NeedsUnwindTables);
}
bool SanitizerArgs::needsLTO() const {
return static_cast<bool>(Sanitizers.Mask & NeedsLTO);
}
SanitizerArgs::SanitizerArgs(const ToolChain &TC,
const llvm::opt::ArgList &Args) {
SanitizerMask AllRemove; // During the loop below, the accumulated set of
// sanitizers disabled by the current sanitizer
// argument or any argument after it.
SanitizerMask AllAddedKinds; // Mask of all sanitizers ever enabled by
// -fsanitize= flags (directly or via group
// expansion), some of which may be disabled
// later. Used to carefully prune
// unused-argument diagnostics.
SanitizerMask DiagnosedKinds; // All Kinds we have diagnosed up to now.
// Used to deduplicate diagnostics.
SanitizerMask Kinds;
const SanitizerMask Supported = setGroupBits(TC.getSupportedSanitizers());
CfiCrossDso = Args.hasFlag(options::OPT_fsanitize_cfi_cross_dso,
options::OPT_fno_sanitize_cfi_cross_dso, false);
ToolChain::RTTIMode RTTIMode = TC.getRTTIMode();
const Driver &D = TC.getDriver();
SanitizerMask TrappingKinds = parseSanitizeTrapArgs(D, Args);
SanitizerMask InvalidTrappingKinds = TrappingKinds & NotAllowedWithTrap;
MinimalRuntime =
Args.hasFlag(options::OPT_fsanitize_minimal_runtime,
options::OPT_fno_sanitize_minimal_runtime, MinimalRuntime);
// The object size sanitizer should not be enabled at -O0.
Arg *OptLevel = Args.getLastArg(options::OPT_O_Group);
bool RemoveObjectSizeAtO0 =
!OptLevel || OptLevel->getOption().matches(options::OPT_O0);
for (ArgList::const_reverse_iterator I = Args.rbegin(), E = Args.rend();
I != E; ++I) {
const auto *Arg = *I;
if (Arg->getOption().matches(options::OPT_fsanitize_EQ)) {
Arg->claim();
SanitizerMask Add = parseArgValues(D, Arg, /*DiagnoseErrors=*/true);
if (RemoveObjectSizeAtO0) {
AllRemove |= SanitizerKind::ObjectSize;
// The user explicitly enabled the object size sanitizer. Warn
// that this does nothing at -O0.
if (Add & SanitizerKind::ObjectSize)
D.Diag(diag::warn_drv_object_size_disabled_O0)
<< Arg->getAsString(Args);
}
AllAddedKinds |= expandSanitizerGroups(Add);
// Avoid diagnosing any sanitizer which is disabled later.
Add &= ~AllRemove;
// At this point we have not expanded groups, so any unsupported
// sanitizers in Add are those which have been explicitly enabled.
// Diagnose them.
if (SanitizerMask KindsToDiagnose =
Add & InvalidTrappingKinds & ~DiagnosedKinds) {
std::string Desc = describeSanitizeArg(*I, KindsToDiagnose);
D.Diag(diag::err_drv_argument_not_allowed_with)
<< Desc << "-fsanitize-trap=undefined";
DiagnosedKinds |= KindsToDiagnose;
}
Add &= ~InvalidTrappingKinds;
if (MinimalRuntime) {
if (SanitizerMask KindsToDiagnose =
Add & NotAllowedWithMinimalRuntime & ~DiagnosedKinds) {
std::string Desc = describeSanitizeArg(*I, KindsToDiagnose);
D.Diag(diag::err_drv_argument_not_allowed_with)
<< Desc << "-fsanitize-minimal-runtime";
DiagnosedKinds |= KindsToDiagnose;
}
Add &= ~NotAllowedWithMinimalRuntime;
}
// FIXME: Make CFI on member function calls compatible with cross-DSO CFI.
// There are currently two problems:
// - Virtual function call checks need to pass a pointer to the function
// address to llvm.type.test and a pointer to the address point to the
// diagnostic function. Currently we pass the same pointer to both
// places.
// - Non-virtual function call checks may need to check multiple type
// identifiers.
// Fixing both of those may require changes to the cross-DSO CFI
// interface.
if (CfiCrossDso && (Add & SanitizerKind::CFIMFCall & ~DiagnosedKinds)) {
D.Diag(diag::err_drv_argument_not_allowed_with)
<< "-fsanitize=cfi-mfcall"
<< "-fsanitize-cfi-cross-dso";
Add &= ~SanitizerKind::CFIMFCall;
DiagnosedKinds |= SanitizerKind::CFIMFCall;
}
if (SanitizerMask KindsToDiagnose = Add & ~Supported & ~DiagnosedKinds) {
std::string Desc = describeSanitizeArg(*I, KindsToDiagnose);
D.Diag(diag::err_drv_unsupported_opt_for_target)
<< Desc << TC.getTriple().str();
DiagnosedKinds |= KindsToDiagnose;
}
Add &= Supported;
// Test for -fno-rtti + explicit -fsanitize=vptr before expanding groups
// so we don't error out if -fno-rtti and -fsanitize=undefined were
// passed.
if ((Add & SanitizerKind::Vptr) && (RTTIMode == ToolChain::RM_Disabled)) {
if (const llvm::opt::Arg *NoRTTIArg = TC.getRTTIArg()) {
assert(NoRTTIArg->getOption().matches(options::OPT_fno_rtti) &&
"RTTI disabled without -fno-rtti option?");
// The user explicitly passed -fno-rtti with -fsanitize=vptr, but
// the vptr sanitizer requires RTTI, so this is a user error.
D.Diag(diag::err_drv_argument_not_allowed_with)
<< "-fsanitize=vptr" << NoRTTIArg->getAsString(Args);
} else {
// The vptr sanitizer requires RTTI, but RTTI is disabled (by
// default). Warn that the vptr sanitizer is being disabled.
D.Diag(diag::warn_drv_disabling_vptr_no_rtti_default);
}
// Take out the Vptr sanitizer from the enabled sanitizers
AllRemove |= SanitizerKind::Vptr;
}
Add = expandSanitizerGroups(Add);
// Group expansion may have enabled a sanitizer which is disabled later.
Add &= ~AllRemove;
// Silently discard any unsupported sanitizers implicitly enabled through
// group expansion.
Add &= ~InvalidTrappingKinds;
if (MinimalRuntime) {
Add &= ~NotAllowedWithMinimalRuntime;
}
if (CfiCrossDso)
Add &= ~SanitizerKind::CFIMFCall;
Add &= Supported;
if (Add & SanitizerKind::Fuzzer)
Add |= SanitizerKind::FuzzerNoLink;
// Enable coverage if the fuzzing flag is set.
if (Add & SanitizerKind::FuzzerNoLink) {
CoverageFeatures |= CoverageInline8bitCounters | CoverageIndirCall |
CoverageTraceCmp | CoveragePCTable;
// Due to TLS differences, stack depth tracking is only enabled on Linux.
if (TC.getTriple().isOSLinux())
CoverageFeatures |= CoverageStackDepth;
}
Kinds |= Add;
} else if (Arg->getOption().matches(options::OPT_fno_sanitize_EQ)) {
Arg->claim();
SanitizerMask Remove = parseArgValues(D, Arg, true);
AllRemove |= expandSanitizerGroups(Remove);
}
}
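// Illustrative outcome of the loop above (example command line):
// "-fsanitize=address,undefined -fno-sanitize=alignment" leaves Kinds
// with Address plus the expanded "undefined" group minus Alignment,
// since the reverse walk records the removal first.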
std::pair<SanitizerMask, SanitizerMask> IncompatibleGroups[] = {
std::make_pair(SanitizerKind::Address,
SanitizerKind::Thread | SanitizerKind::Memory),
std::make_pair(SanitizerKind::Thread, SanitizerKind::Memory),
std::make_pair(SanitizerKind::Leak,
SanitizerKind::Thread | SanitizerKind::Memory),
std::make_pair(SanitizerKind::KernelAddress,
SanitizerKind::Address | SanitizerKind::Leak |
SanitizerKind::Thread | SanitizerKind::Memory),
std::make_pair(SanitizerKind::HWAddress,
SanitizerKind::Address | SanitizerKind::Thread |
SanitizerKind::Memory | SanitizerKind::KernelAddress),
std::make_pair(SanitizerKind::Scudo,
SanitizerKind::Address | SanitizerKind::HWAddress |
SanitizerKind::Leak | SanitizerKind::Thread |
SanitizerKind::Memory | SanitizerKind::KernelAddress),
std::make_pair(SanitizerKind::SafeStack,
SanitizerKind::Address | SanitizerKind::HWAddress |
SanitizerKind::Leak | SanitizerKind::Thread |
SanitizerKind::Memory | SanitizerKind::KernelAddress),
std::make_pair(SanitizerKind::KernelHWAddress,
SanitizerKind::Address | SanitizerKind::HWAddress |
SanitizerKind::Leak | SanitizerKind::Thread |
SanitizerKind::Memory | SanitizerKind::KernelAddress |
SanitizerKind::SafeStack),
std::make_pair(SanitizerKind::KernelMemory,
SanitizerKind::Address | SanitizerKind::HWAddress |
SanitizerKind::Leak | SanitizerKind::Thread |
SanitizerKind::Memory | SanitizerKind::KernelAddress |
SanitizerKind::Scudo | SanitizerKind::SafeStack),
std::make_pair(SanitizerKind::MemTag,
SanitizerKind::Address | SanitizerKind::KernelAddress |
SanitizerKind::HWAddress |
SanitizerKind::KernelHWAddress)};
// Enable toolchain specific default sanitizers if not explicitly disabled.
SanitizerMask Default = TC.getDefaultSanitizers() & ~AllRemove;
// Disable default sanitizers that are incompatible with explicitly requested
// ones.
for (auto G : IncompatibleGroups) {
SanitizerMask Group = G.first;
if ((Default & Group) && (Kinds & G.second))
Default &= ~Group;
}
Kinds |= Default;
// We disable the vptr sanitizer if it was enabled by group expansion but RTTI
// is disabled.
if ((Kinds & SanitizerKind::Vptr) && (RTTIMode == ToolChain::RM_Disabled)) {
Kinds &= ~SanitizerKind::Vptr;
}
// Check that LTO is enabled if we need it.
if ((Kinds & NeedsLTO) && !D.isUsingLTO()) {
D.Diag(diag::err_drv_argument_only_allowed_with)
<< lastArgumentForMask(D, Args, Kinds & NeedsLTO) << "-flto";
}
- if ((Kinds & SanitizerKind::ShadowCallStack) &&
- TC.getTriple().getArch() == llvm::Triple::aarch64 &&
+ if ((Kinds & SanitizerKind::ShadowCallStack) && TC.getTriple().isAArch64() &&
!llvm::AArch64::isX18ReservedByDefault(TC.getTriple()) &&
!Args.hasArg(options::OPT_ffixed_x18)) {
D.Diag(diag::err_drv_argument_only_allowed_with)
<< lastArgumentForMask(D, Args, Kinds & SanitizerKind::ShadowCallStack)
<< "-ffixed-x18";
}
// Report an error if there are non-trapping sanitizers that require
// c++abi-specific parts of the UBSan runtime, and they are not provided
// by the toolchain. We don't have a good way to check the latter, so we
// just check whether the toolchain supports vptr.
if (~Supported & SanitizerKind::Vptr) {
SanitizerMask KindsToDiagnose = Kinds & ~TrappingKinds & NeedsUbsanCxxRt;
// The runtime library supports the Microsoft C++ ABI, but only well enough
// for CFI. FIXME: Remove this once we support vptr on Windows.
if (TC.getTriple().isOSWindows())
KindsToDiagnose &= ~SanitizerKind::CFI;
if (KindsToDiagnose) {
SanitizerSet S;
S.Mask = KindsToDiagnose;
D.Diag(diag::err_drv_unsupported_opt_for_target)
<< ("-fno-sanitize-trap=" + toString(S)) << TC.getTriple().str();
Kinds &= ~KindsToDiagnose;
}
}
// Warn about incompatible groups of sanitizers.
for (auto G : IncompatibleGroups) {
SanitizerMask Group = G.first;
if (Kinds & Group) {
if (SanitizerMask Incompatible = Kinds & G.second) {
D.Diag(clang::diag::err_drv_argument_not_allowed_with)
<< lastArgumentForMask(D, Args, Group)
<< lastArgumentForMask(D, Args, Incompatible);
Kinds &= ~Incompatible;
}
}
}
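// Illustrative conflict (example command line): "-fsanitize=address,thread"
// pairs Address with Thread above, emits err_drv_argument_not_allowed_with,
// and drops the Thread bit.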
// FIXME: Currently -fsanitize=leak is silently ignored in the presence of
// -fsanitize=address. Perhaps it should print an error, or perhaps
// -f(-no)sanitize=leak should change whether leak detection is enabled by
// default in ASan?
// Parse -f(no-)?sanitize-recover flags.
SanitizerMask RecoverableKinds = RecoverableByDefault | AlwaysRecoverable;
SanitizerMask DiagnosedUnrecoverableKinds;
SanitizerMask DiagnosedAlwaysRecoverableKinds;
for (const auto *Arg : Args) {
const char *DeprecatedReplacement = nullptr;
if (Arg->getOption().matches(options::OPT_fsanitize_recover)) {
DeprecatedReplacement =
"-fsanitize-recover=undefined,integer' or '-fsanitize-recover=all";
RecoverableKinds |= expandSanitizerGroups(LegacyFsanitizeRecoverMask);
Arg->claim();
} else if (Arg->getOption().matches(options::OPT_fno_sanitize_recover)) {
DeprecatedReplacement = "-fno-sanitize-recover=undefined,integer' or "
"'-fno-sanitize-recover=all";
RecoverableKinds &= ~expandSanitizerGroups(LegacyFsanitizeRecoverMask);
Arg->claim();
} else if (Arg->getOption().matches(options::OPT_fsanitize_recover_EQ)) {
SanitizerMask Add = parseArgValues(D, Arg, true);
// Report error if user explicitly tries to recover from unrecoverable
// sanitizer.
if (SanitizerMask KindsToDiagnose =
Add & Unrecoverable & ~DiagnosedUnrecoverableKinds) {
SanitizerSet SetToDiagnose;
SetToDiagnose.Mask |= KindsToDiagnose;
D.Diag(diag::err_drv_unsupported_option_argument)
<< Arg->getOption().getName() << toString(SetToDiagnose);
DiagnosedUnrecoverableKinds |= KindsToDiagnose;
}
RecoverableKinds |= expandSanitizerGroups(Add);
Arg->claim();
} else if (Arg->getOption().matches(options::OPT_fno_sanitize_recover_EQ)) {
SanitizerMask Remove = parseArgValues(D, Arg, true);
// Report error if user explicitly tries to disable recovery from
// always recoverable sanitizer.
if (SanitizerMask KindsToDiagnose =
Remove & AlwaysRecoverable & ~DiagnosedAlwaysRecoverableKinds) {
SanitizerSet SetToDiagnose;
SetToDiagnose.Mask |= KindsToDiagnose;
D.Diag(diag::err_drv_unsupported_option_argument)
<< Arg->getOption().getName() << toString(SetToDiagnose);
DiagnosedAlwaysRecoverableKinds |= KindsToDiagnose;
}
RecoverableKinds &= ~expandSanitizerGroups(Remove);
Arg->claim();
}
if (DeprecatedReplacement) {
D.Diag(diag::warn_drv_deprecated_arg) << Arg->getAsString(Args)
<< DeprecatedReplacement;
}
}
RecoverableKinds &= Kinds;
RecoverableKinds &= ~Unrecoverable;
TrappingKinds &= Kinds;
RecoverableKinds &= ~TrappingKinds;
// Setup blacklist files.
// Add default blacklist from resource directory.
addDefaultBlacklists(D, Kinds, SystemBlacklistFiles);
// Parse -f(no-)sanitize-blacklist options.
for (const auto *Arg : Args) {
if (Arg->getOption().matches(options::OPT_fsanitize_blacklist)) {
Arg->claim();
std::string BLPath = Arg->getValue();
if (D.getVFS().exists(BLPath)) {
UserBlacklistFiles.push_back(BLPath);
} else {
D.Diag(clang::diag::err_drv_no_such_file) << BLPath;
}
} else if (Arg->getOption().matches(options::OPT_fno_sanitize_blacklist)) {
Arg->claim();
UserBlacklistFiles.clear();
SystemBlacklistFiles.clear();
}
}
// Validate blacklists format.
{
std::string BLError;
std::unique_ptr<llvm::SpecialCaseList> SCL(
llvm::SpecialCaseList::create(UserBlacklistFiles, D.getVFS(), BLError));
if (!SCL.get())
D.Diag(clang::diag::err_drv_malformed_sanitizer_blacklist) << BLError;
}
{
std::string BLError;
std::unique_ptr<llvm::SpecialCaseList> SCL(llvm::SpecialCaseList::create(
SystemBlacklistFiles, D.getVFS(), BLError));
if (!SCL.get())
D.Diag(clang::diag::err_drv_malformed_sanitizer_blacklist) << BLError;
}
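// An illustrative blacklist file (hypothetical path and names, format per
// llvm::SpecialCaseList):
//   fun:*do_not_instrument*
//   src:third_party/legacy.c
// supplied as -fsanitize-blacklist=/path/to/list.txt.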
// Parse -f[no-]sanitize-memory-track-origins[=level] options.
if (AllAddedKinds & SanitizerKind::Memory) {
if (Arg *A =
Args.getLastArg(options::OPT_fsanitize_memory_track_origins_EQ,
options::OPT_fsanitize_memory_track_origins,
options::OPT_fno_sanitize_memory_track_origins)) {
if (A->getOption().matches(options::OPT_fsanitize_memory_track_origins)) {
MsanTrackOrigins = 2;
} else if (A->getOption().matches(
options::OPT_fno_sanitize_memory_track_origins)) {
MsanTrackOrigins = 0;
} else {
StringRef S = A->getValue();
if (S.getAsInteger(0, MsanTrackOrigins) || MsanTrackOrigins < 0 ||
MsanTrackOrigins > 2) {
D.Diag(clang::diag::err_drv_invalid_value) << A->getAsString(Args) << S;
}
}
}
MsanUseAfterDtor =
Args.hasFlag(options::OPT_fsanitize_memory_use_after_dtor,
options::OPT_fno_sanitize_memory_use_after_dtor,
MsanUseAfterDtor);
NeedPIE |= !(TC.getTriple().isOSLinux() &&
TC.getTriple().getArch() == llvm::Triple::x86_64);
} else {
MsanUseAfterDtor = false;
}
if (AllAddedKinds & SanitizerKind::Thread) {
TsanMemoryAccess = Args.hasFlag(
options::OPT_fsanitize_thread_memory_access,
options::OPT_fno_sanitize_thread_memory_access, TsanMemoryAccess);
TsanFuncEntryExit = Args.hasFlag(
options::OPT_fsanitize_thread_func_entry_exit,
options::OPT_fno_sanitize_thread_func_entry_exit, TsanFuncEntryExit);
TsanAtomics =
Args.hasFlag(options::OPT_fsanitize_thread_atomics,
options::OPT_fno_sanitize_thread_atomics, TsanAtomics);
}
if (AllAddedKinds & SanitizerKind::CFI) {
// Without PIE, an external function address may resolve to a PLT record,
// which cannot be verified by the target module.
NeedPIE |= CfiCrossDso;
CfiICallGeneralizePointers =
Args.hasArg(options::OPT_fsanitize_cfi_icall_generalize_pointers);
if (CfiCrossDso && CfiICallGeneralizePointers)
D.Diag(diag::err_drv_argument_not_allowed_with)
<< "-fsanitize-cfi-cross-dso"
<< "-fsanitize-cfi-icall-generalize-pointers";
CfiCanonicalJumpTables =
Args.hasFlag(options::OPT_fsanitize_cfi_canonical_jump_tables,
options::OPT_fno_sanitize_cfi_canonical_jump_tables, true);
}
Stats = Args.hasFlag(options::OPT_fsanitize_stats,
options::OPT_fno_sanitize_stats, false);
if (MinimalRuntime) {
SanitizerMask IncompatibleMask =
Kinds & ~setGroupBits(CompatibleWithMinimalRuntime);
if (IncompatibleMask)
D.Diag(clang::diag::err_drv_argument_not_allowed_with)
<< "-fsanitize-minimal-runtime"
<< lastArgumentForMask(D, Args, IncompatibleMask);
SanitizerMask NonTrappingCfi = Kinds & SanitizerKind::CFI & ~TrappingKinds;
if (NonTrappingCfi)
D.Diag(clang::diag::err_drv_argument_only_allowed_with)
<< "fsanitize-minimal-runtime"
<< "fsanitize-trap=cfi";
}
// Parse -f(no-)?sanitize-coverage flags if coverage is supported by the
// enabled sanitizers.
for (const auto *Arg : Args) {
if (Arg->getOption().matches(options::OPT_fsanitize_coverage)) {
int LegacySanitizeCoverage;
if (Arg->getNumValues() == 1 &&
!StringRef(Arg->getValue(0))
.getAsInteger(0, LegacySanitizeCoverage)) {
CoverageFeatures = 0;
Arg->claim();
if (LegacySanitizeCoverage != 0) {
D.Diag(diag::warn_drv_deprecated_arg)
<< Arg->getAsString(Args) << "-fsanitize-coverage=trace-pc-guard";
}
continue;
}
CoverageFeatures |= parseCoverageFeatures(D, Arg);
// Disable coverage and do not claim the flags if at least one enabled
// sanitizer does not support it.
if (!(AllAddedKinds & ~AllRemove & ~setGroupBits(SupportsCoverage))) {
Arg->claim();
} else {
CoverageFeatures = 0;
}
} else if (Arg->getOption().matches(options::OPT_fno_sanitize_coverage)) {
Arg->claim();
CoverageFeatures &= ~parseCoverageFeatures(D, Arg);
}
}
// Choose at most one coverage type: function, bb, or edge.
if ((CoverageFeatures & CoverageFunc) && (CoverageFeatures & CoverageBB))
D.Diag(clang::diag::err_drv_argument_not_allowed_with)
<< "-fsanitize-coverage=func"
<< "-fsanitize-coverage=bb";
if ((CoverageFeatures & CoverageFunc) && (CoverageFeatures & CoverageEdge))
D.Diag(clang::diag::err_drv_argument_not_allowed_with)
<< "-fsanitize-coverage=func"
<< "-fsanitize-coverage=edge";
if ((CoverageFeatures & CoverageBB) && (CoverageFeatures & CoverageEdge))
D.Diag(clang::diag::err_drv_argument_not_allowed_with)
<< "-fsanitize-coverage=bb"
<< "-fsanitize-coverage=edge";
// Basic block tracing and 8-bit counters are deprecated; steer users
// toward trace-pc-guard instead.
if (CoverageFeatures & CoverageTraceBB)
D.Diag(clang::diag::warn_drv_deprecated_arg)
<< "-fsanitize-coverage=trace-bb"
<< "-fsanitize-coverage=trace-pc-guard";
if (CoverageFeatures & Coverage8bitCounters)
D.Diag(clang::diag::warn_drv_deprecated_arg)
<< "-fsanitize-coverage=8bit-counters"
<< "-fsanitize-coverage=trace-pc-guard";
int InsertionPointTypes = CoverageFunc | CoverageBB | CoverageEdge;
int InstrumentationTypes =
CoverageTracePC | CoverageTracePCGuard | CoverageInline8bitCounters;
if ((CoverageFeatures & InsertionPointTypes) &&
!(CoverageFeatures & InstrumentationTypes)) {
D.Diag(clang::diag::warn_drv_deprecated_arg)
<< "-fsanitize-coverage=[func|bb|edge]"
<< "-fsanitize-coverage=[func|bb|edge],[trace-pc-guard|trace-pc]";
}
// trace-pc w/o func/bb/edge implies edge.
if (!(CoverageFeatures & InsertionPointTypes)) {
if (CoverageFeatures &
(CoverageTracePC | CoverageTracePCGuard | CoverageInline8bitCounters))
CoverageFeatures |= CoverageEdge;
if (CoverageFeatures & CoverageStackDepth)
CoverageFeatures |= CoverageFunc;
}
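// Illustrative implication: plain "-fsanitize-coverage=trace-pc-guard"
// names no insertion point, so the logic above adds CoverageEdge, as if
// "edge,trace-pc-guard" had been written.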
SharedRuntime =
Args.hasFlag(options::OPT_shared_libsan, options::OPT_static_libsan,
TC.getTriple().isAndroid() || TC.getTriple().isOSFuchsia() ||
TC.getTriple().isOSDarwin());
ImplicitCfiRuntime = TC.getTriple().isAndroid();
if (AllAddedKinds & SanitizerKind::Address) {
NeedPIE |= TC.getTriple().isOSFuchsia();
if (Arg *A =
Args.getLastArg(options::OPT_fsanitize_address_field_padding)) {
StringRef S = A->getValue();
// Legal values are 0, 1, and 2, but in the future we may add more levels.
if (S.getAsInteger(0, AsanFieldPadding) || AsanFieldPadding < 0 ||
AsanFieldPadding > 2) {
D.Diag(clang::diag::err_drv_invalid_value) << A->getAsString(Args) << S;
}
}
if (Arg *WindowsDebugRTArg =
Args.getLastArg(options::OPT__SLASH_MTd, options::OPT__SLASH_MT,
options::OPT__SLASH_MDd, options::OPT__SLASH_MD,
options::OPT__SLASH_LDd, options::OPT__SLASH_LD)) {
switch (WindowsDebugRTArg->getOption().getID()) {
case options::OPT__SLASH_MTd:
case options::OPT__SLASH_MDd:
case options::OPT__SLASH_LDd:
D.Diag(clang::diag::err_drv_argument_not_allowed_with)
<< WindowsDebugRTArg->getAsString(Args)
<< lastArgumentForMask(D, Args, SanitizerKind::Address);
D.Diag(clang::diag::note_drv_address_sanitizer_debug_runtime);
}
}
AsanUseAfterScope = Args.hasFlag(
options::OPT_fsanitize_address_use_after_scope,
options::OPT_fno_sanitize_address_use_after_scope, AsanUseAfterScope);
AsanPoisonCustomArrayCookie = Args.hasFlag(
options::OPT_fsanitize_address_poison_custom_array_cookie,
options::OPT_fno_sanitize_address_poison_custom_array_cookie,
AsanPoisonCustomArrayCookie);
// As a workaround for a bug in gold 2.26 and earlier, dead stripping of
// globals in ASan is disabled by default on ELF targets.
// See https://sourceware.org/bugzilla/show_bug.cgi?id=19002
AsanGlobalsDeadStripping =
!TC.getTriple().isOSBinFormatELF() || TC.getTriple().isOSFuchsia() ||
TC.getTriple().isPS4() ||
Args.hasArg(options::OPT_fsanitize_address_globals_dead_stripping);
AsanUseOdrIndicator =
Args.hasFlag(options::OPT_fsanitize_address_use_odr_indicator,
options::OPT_fno_sanitize_address_use_odr_indicator,
AsanUseOdrIndicator);
if (AllAddedKinds & SanitizerKind::PointerCompare & ~AllRemove) {
AsanInvalidPointerCmp = true;
}
if (AllAddedKinds & SanitizerKind::PointerSubtract & ~AllRemove) {
AsanInvalidPointerSub = true;
}
} else {
AsanUseAfterScope = false;
// -fsanitize=pointer-compare/pointer-subtract requires -fsanitize=address.
SanitizerMask DetectInvalidPointerPairs =
SanitizerKind::PointerCompare | SanitizerKind::PointerSubtract;
if (AllAddedKinds & DetectInvalidPointerPairs & ~AllRemove) {
TC.getDriver().Diag(clang::diag::err_drv_argument_only_allowed_with)
<< lastArgumentForMask(D, Args,
SanitizerKind::PointerCompare |
SanitizerKind::PointerSubtract)
<< "-fsanitize=address";
}
}
if (AllAddedKinds & SanitizerKind::HWAddress) {
if (Arg *HwasanAbiArg =
Args.getLastArg(options::OPT_fsanitize_hwaddress_abi_EQ)) {
HwasanAbi = HwasanAbiArg->getValue();
if (HwasanAbi != "platform" && HwasanAbi != "interceptor")
D.Diag(clang::diag::err_drv_invalid_value)
<< HwasanAbiArg->getAsString(Args) << HwasanAbi;
} else {
HwasanAbi = "interceptor";
}
}
if (AllAddedKinds & SanitizerKind::SafeStack) {
// SafeStack runtime is built into the system on Fuchsia.
SafeStackRuntime = !TC.getTriple().isOSFuchsia();
}
LinkRuntimes =
Args.hasFlag(options::OPT_fsanitize_link_runtime,
options::OPT_fno_sanitize_link_runtime, LinkRuntimes);
// Parse -f[no-]sanitize-link-c++-runtime flags.
LinkCXXRuntimes = Args.hasFlag(options::OPT_fsanitize_link_cxx_runtime,
options::OPT_fno_sanitize_link_cxx_runtime,
LinkCXXRuntimes) ||
D.CCCIsCXX();
// Finally, initialize the set of available and recoverable sanitizers.
Sanitizers.Mask |= Kinds;
RecoverableSanitizers.Mask |= RecoverableKinds;
TrapSanitizers.Mask |= TrappingKinds;
assert(!(RecoverableKinds & TrappingKinds) &&
"Overlap between recoverable and trapping sanitizers");
}
static std::string toString(const clang::SanitizerSet &Sanitizers) {
std::string Res;
#define SANITIZER(NAME, ID) \
if (Sanitizers.has(SanitizerKind::ID)) { \
if (!Res.empty()) \
Res += ","; \
Res += NAME; \
}
#include "clang/Basic/Sanitizers.def"
return Res;
}
static void addIncludeLinkerOption(const ToolChain &TC,
const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs,
StringRef SymbolName) {
SmallString<64> LinkerOptionFlag;
LinkerOptionFlag = "--linker-option=/include:";
if (TC.getTriple().getArch() == llvm::Triple::x86) {
// Win32 mangles C function names with a '_' prefix.
LinkerOptionFlag += '_';
}
LinkerOptionFlag += SymbolName;
CmdArgs.push_back(Args.MakeArgString(LinkerOptionFlag));
}
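// Illustrative output: for SymbolName "__sanitizer_stats_register" on
// 32-bit x86 Windows, this pushes
// "--linker-option=/include:___sanitizer_stats_register" (note the extra
// '_' from Win32 C name mangling).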
static bool hasTargetFeatureMTE(const llvm::opt::ArgStringList &CmdArgs) {
for (auto Start = CmdArgs.begin(), End = CmdArgs.end(); Start != End; ++Start) {
auto It = std::find(Start, End, StringRef("+mte"));
if (It == End)
break;
if (It > Start && *std::prev(It) == StringRef("-target-feature"))
return true;
Start = It;
}
return false;
}
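// Illustrative match: the scan accepts a cc1 argument vector containing
// the adjacent pair "-target-feature" "+mte" (as produced by, e.g., an
// -march value with the memtag extension) and ignores a "+mte" that
// follows any other argument.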
void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs,
types::ID InputType) const {
// NVPTX doesn't currently support sanitizers. Bailing out here means that
// e.g. -fsanitize=address applies only to host code, which is what we want
// for now.
if (TC.getTriple().isNVPTX())
return;
// Translate available CoverageFeatures to corresponding clang-cc1 flags.
// Do it even if Sanitizers.empty() since some forms of coverage don't require
// sanitizers.
std::pair<int, const char *> CoverageFlags[] = {
std::make_pair(CoverageFunc, "-fsanitize-coverage-type=1"),
std::make_pair(CoverageBB, "-fsanitize-coverage-type=2"),
std::make_pair(CoverageEdge, "-fsanitize-coverage-type=3"),
std::make_pair(CoverageIndirCall, "-fsanitize-coverage-indirect-calls"),
std::make_pair(CoverageTraceBB, "-fsanitize-coverage-trace-bb"),
std::make_pair(CoverageTraceCmp, "-fsanitize-coverage-trace-cmp"),
std::make_pair(CoverageTraceDiv, "-fsanitize-coverage-trace-div"),
std::make_pair(CoverageTraceGep, "-fsanitize-coverage-trace-gep"),
std::make_pair(Coverage8bitCounters, "-fsanitize-coverage-8bit-counters"),
std::make_pair(CoverageTracePC, "-fsanitize-coverage-trace-pc"),
std::make_pair(CoverageTracePCGuard, "-fsanitize-coverage-trace-pc-guard"),
std::make_pair(CoverageInline8bitCounters, "-fsanitize-coverage-inline-8bit-counters"),
std::make_pair(CoveragePCTable, "-fsanitize-coverage-pc-table"),
std::make_pair(CoverageNoPrune, "-fsanitize-coverage-no-prune"),
std::make_pair(CoverageStackDepth, "-fsanitize-coverage-stack-depth")};
for (auto F : CoverageFlags) {
if (CoverageFeatures & F.first)
CmdArgs.push_back(F.second);
}
if (TC.getTriple().isOSWindows() && needsUbsanRt()) {
// Instruct the code generator to embed linker directives in the object file
// that cause the required runtime libraries to be linked.
CmdArgs.push_back(Args.MakeArgString(
"--dependent-lib=" + TC.getCompilerRT(Args, "ubsan_standalone")));
if (types::isCXX(InputType))
CmdArgs.push_back(Args.MakeArgString(
"--dependent-lib=" + TC.getCompilerRT(Args, "ubsan_standalone_cxx")));
}
if (TC.getTriple().isOSWindows() && needsStatsRt()) {
CmdArgs.push_back(Args.MakeArgString("--dependent-lib=" +
TC.getCompilerRT(Args, "stats_client")));
// The main executable must export the stats runtime.
// FIXME: Only exporting from the main executable (e.g. based on whether the
// translation unit defines main()) would save a little space, but having
// multiple copies of the runtime shouldn't hurt.
CmdArgs.push_back(Args.MakeArgString("--dependent-lib=" +
TC.getCompilerRT(Args, "stats")));
addIncludeLinkerOption(TC, Args, CmdArgs, "__sanitizer_stats_register");
}
if (Sanitizers.empty())
return;
CmdArgs.push_back(Args.MakeArgString("-fsanitize=" + toString(Sanitizers)));
if (!RecoverableSanitizers.empty())
CmdArgs.push_back(Args.MakeArgString("-fsanitize-recover=" +
toString(RecoverableSanitizers)));
if (!TrapSanitizers.empty())
CmdArgs.push_back(
Args.MakeArgString("-fsanitize-trap=" + toString(TrapSanitizers)));
for (const auto &BLPath : UserBlacklistFiles) {
SmallString<64> BlacklistOpt("-fsanitize-blacklist=");
BlacklistOpt += BLPath;
CmdArgs.push_back(Args.MakeArgString(BlacklistOpt));
}
for (const auto &BLPath : SystemBlacklistFiles) {
SmallString<64> BlacklistOpt("-fsanitize-system-blacklist=");
BlacklistOpt += BLPath;
CmdArgs.push_back(Args.MakeArgString(BlacklistOpt));
}
if (MsanTrackOrigins)
CmdArgs.push_back(Args.MakeArgString("-fsanitize-memory-track-origins=" +
Twine(MsanTrackOrigins)));
if (MsanUseAfterDtor)
CmdArgs.push_back("-fsanitize-memory-use-after-dtor");
// FIXME: Pass these parameters as function attributes, not as -llvm flags.
if (!TsanMemoryAccess) {
CmdArgs.push_back("-mllvm");
CmdArgs.push_back("-tsan-instrument-memory-accesses=0");
CmdArgs.push_back("-mllvm");
CmdArgs.push_back("-tsan-instrument-memintrinsics=0");
}
if (!TsanFuncEntryExit) {
CmdArgs.push_back("-mllvm");
CmdArgs.push_back("-tsan-instrument-func-entry-exit=0");
}
if (!TsanAtomics) {
CmdArgs.push_back("-mllvm");
CmdArgs.push_back("-tsan-instrument-atomics=0");
}
if (CfiCrossDso)
CmdArgs.push_back("-fsanitize-cfi-cross-dso");
if (CfiICallGeneralizePointers)
CmdArgs.push_back("-fsanitize-cfi-icall-generalize-pointers");
if (CfiCanonicalJumpTables)
CmdArgs.push_back("-fsanitize-cfi-canonical-jump-tables");
if (Stats)
CmdArgs.push_back("-fsanitize-stats");
if (MinimalRuntime)
CmdArgs.push_back("-fsanitize-minimal-runtime");
if (AsanFieldPadding)
CmdArgs.push_back(Args.MakeArgString("-fsanitize-address-field-padding=" +
Twine(AsanFieldPadding)));
if (AsanUseAfterScope)
CmdArgs.push_back("-fsanitize-address-use-after-scope");
if (AsanPoisonCustomArrayCookie)
CmdArgs.push_back("-fsanitize-address-poison-custom-array-cookie");
if (AsanGlobalsDeadStripping)
CmdArgs.push_back("-fsanitize-address-globals-dead-stripping");
if (AsanUseOdrIndicator)
CmdArgs.push_back("-fsanitize-address-use-odr-indicator");
if (AsanInvalidPointerCmp) {
CmdArgs.push_back("-mllvm");
CmdArgs.push_back("-asan-detect-invalid-pointer-cmp");
}
if (AsanInvalidPointerSub) {
CmdArgs.push_back("-mllvm");
CmdArgs.push_back("-asan-detect-invalid-pointer-sub");
}
if (!HwasanAbi.empty()) {
CmdArgs.push_back("-default-function-attr");
CmdArgs.push_back(Args.MakeArgString("hwasan-abi=" + HwasanAbi));
}
if (Sanitizers.has(SanitizerKind::HWAddress)) {
CmdArgs.push_back("-target-feature");
CmdArgs.push_back("+tagged-globals");
}
// MSan: Workaround for PR16386.
// ASan: This is mainly to help LSan with cases such as
// https://github.com/google/sanitizers/issues/373
// We can't make this conditional on -fsanitize=leak, as that flag shouldn't
// affect compilation.
if (Sanitizers.has(SanitizerKind::Memory) ||
Sanitizers.has(SanitizerKind::Address))
CmdArgs.push_back("-fno-assume-sane-operator-new");
// Require -fvisibility= flag on non-Windows when compiling if vptr CFI is
// enabled.
if (Sanitizers.hasOneOf(CFIClasses) && !TC.getTriple().isOSWindows() &&
!Args.hasArg(options::OPT_fvisibility_EQ)) {
TC.getDriver().Diag(clang::diag::err_drv_argument_only_allowed_with)
<< lastArgumentForMask(TC.getDriver(), Args,
Sanitizers.Mask & CFIClasses)
<< "-fvisibility=";
}
if (Sanitizers.has(SanitizerKind::MemTag) && !hasTargetFeatureMTE(CmdArgs))
TC.getDriver().Diag(diag::err_stack_tagging_requires_hardware_feature);
}
SanitizerMask parseArgValues(const Driver &D, const llvm::opt::Arg *A,
bool DiagnoseErrors) {
assert((A->getOption().matches(options::OPT_fsanitize_EQ) ||
A->getOption().matches(options::OPT_fno_sanitize_EQ) ||
A->getOption().matches(options::OPT_fsanitize_recover_EQ) ||
A->getOption().matches(options::OPT_fno_sanitize_recover_EQ) ||
A->getOption().matches(options::OPT_fsanitize_trap_EQ) ||
A->getOption().matches(options::OPT_fno_sanitize_trap_EQ)) &&
"Invalid argument in parseArgValues!");
SanitizerMask Kinds;
for (int i = 0, n = A->getNumValues(); i != n; ++i) {
const char *Value = A->getValue(i);
SanitizerMask Kind;
// Special case: don't accept -fsanitize=all.
if (A->getOption().matches(options::OPT_fsanitize_EQ) &&
0 == strcmp("all", Value))
Kind = SanitizerMask();
else
Kind = parseSanitizerValue(Value, /*AllowGroups=*/true);
if (Kind)
Kinds |= Kind;
else if (DiagnoseErrors)
D.Diag(clang::diag::err_drv_unsupported_option_argument)
<< A->getOption().getName() << Value;
}
return Kinds;
}
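// Illustrative call: for "-fsanitize=address,bogus" with DiagnoseErrors
// set, the loop ORs in SanitizerKind::Address and emits
// err_drv_unsupported_option_argument for "bogus".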
int parseCoverageFeatures(const Driver &D, const llvm::opt::Arg *A) {
assert(A->getOption().matches(options::OPT_fsanitize_coverage) ||
A->getOption().matches(options::OPT_fno_sanitize_coverage));
int Features = 0;
for (int i = 0, n = A->getNumValues(); i != n; ++i) {
const char *Value = A->getValue(i);
int F = llvm::StringSwitch<int>(Value)
.Case("func", CoverageFunc)
.Case("bb", CoverageBB)
.Case("edge", CoverageEdge)
.Case("indirect-calls", CoverageIndirCall)
.Case("trace-bb", CoverageTraceBB)
.Case("trace-cmp", CoverageTraceCmp)
.Case("trace-div", CoverageTraceDiv)
.Case("trace-gep", CoverageTraceGep)
.Case("8bit-counters", Coverage8bitCounters)
.Case("trace-pc", CoverageTracePC)
.Case("trace-pc-guard", CoverageTracePCGuard)
.Case("no-prune", CoverageNoPrune)
.Case("inline-8bit-counters", CoverageInline8bitCounters)
.Case("pc-table", CoveragePCTable)
.Case("stack-depth", CoverageStackDepth)
.Default(0);
if (F == 0)
D.Diag(clang::diag::err_drv_unsupported_option_argument)
<< A->getOption().getName() << Value;
Features |= F;
}
return Features;
}
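// Illustrative diagnostic: "-fsanitize-coverage=bogus" falls through to
// Default(0) above and emits err_drv_unsupported_option_argument.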
std::string lastArgumentForMask(const Driver &D, const llvm::opt::ArgList &Args,
SanitizerMask Mask) {
for (llvm::opt::ArgList::const_reverse_iterator I = Args.rbegin(),
E = Args.rend();
I != E; ++I) {
const auto *Arg = *I;
if (Arg->getOption().matches(options::OPT_fsanitize_EQ)) {
SanitizerMask AddKinds =
expandSanitizerGroups(parseArgValues(D, Arg, false));
if (AddKinds & Mask)
return describeSanitizeArg(Arg, Mask);
} else if (Arg->getOption().matches(options::OPT_fno_sanitize_EQ)) {
SanitizerMask RemoveKinds =
expandSanitizerGroups(parseArgValues(D, Arg, false));
Mask &= ~RemoveKinds;
}
}
llvm_unreachable("arg list didn't provide expected value");
}
std::string describeSanitizeArg(const llvm::opt::Arg *A, SanitizerMask Mask) {
assert(A->getOption().matches(options::OPT_fsanitize_EQ)
&& "Invalid argument in describeSanitizeArg!");
std::string Sanitizers;
for (int i = 0, n = A->getNumValues(); i != n; ++i) {
if (expandSanitizerGroups(
parseSanitizerValue(A->getValue(i), /*AllowGroups=*/true)) &
Mask) {
if (!Sanitizers.empty())
Sanitizers += ",";
Sanitizers += A->getValue(i);
}
}
assert(!Sanitizers.empty() && "arg didn't provide expected value");
return "-fsanitize=" + Sanitizers;
}
Index: head/contrib/llvm-project/clang/lib/Driver/ToolChain.cpp
===================================================================
--- head/contrib/llvm-project/clang/lib/Driver/ToolChain.cpp (revision 362608)
+++ head/contrib/llvm-project/clang/lib/Driver/ToolChain.cpp (revision 362609)
@@ -1,1095 +1,1092 @@
//===- ToolChain.cpp - Collections of tools for one platform --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Driver/ToolChain.h"
#include "InputInfo.h"
#include "ToolChains/Arch/ARM.h"
#include "ToolChains/Clang.h"
#include "ToolChains/InterfaceStubs.h"
#include "ToolChains/Flang.h"
#include "clang/Basic/ObjCRuntime.h"
#include "clang/Basic/Sanitizers.h"
#include "clang/Config/config.h"
#include "clang/Driver/Action.h"
#include "clang/Driver/Driver.h"
#include "clang/Driver/DriverDiagnostic.h"
#include "clang/Driver/Job.h"
#include "clang/Driver/Options.h"
#include "clang/Driver/SanitizerArgs.h"
#include "clang/Driver/XRayArgs.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/OptTable.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/VersionTuple.h"
#include "llvm/Support/VirtualFileSystem.h"
#include <cassert>
#include <cstddef>
#include <cstring>
#include <string>
using namespace clang;
using namespace driver;
using namespace tools;
using namespace llvm;
using namespace llvm::opt;
static llvm::opt::Arg *GetRTTIArgument(const ArgList &Args) {
return Args.getLastArg(options::OPT_mkernel, options::OPT_fapple_kext,
options::OPT_fno_rtti, options::OPT_frtti);
}
static ToolChain::RTTIMode CalculateRTTIMode(const ArgList &Args,
const llvm::Triple &Triple,
const Arg *CachedRTTIArg) {
// Explicit rtti/no-rtti args
if (CachedRTTIArg) {
if (CachedRTTIArg->getOption().matches(options::OPT_frtti))
return ToolChain::RM_Enabled;
else
return ToolChain::RM_Disabled;
}
// -frtti is default, except for the PS4 CPU.
return (Triple.isPS4CPU()) ? ToolChain::RM_Disabled : ToolChain::RM_Enabled;
}
ToolChain::ToolChain(const Driver &D, const llvm::Triple &T,
const ArgList &Args)
: D(D), Triple(T), Args(Args), CachedRTTIArg(GetRTTIArgument(Args)),
CachedRTTIMode(CalculateRTTIMode(Args, Triple, CachedRTTIArg)) {
if (D.CCCIsCXX()) {
if (auto CXXStdlibPath = getCXXStdlibPath())
getFilePaths().push_back(*CXXStdlibPath);
}
if (auto RuntimePath = getRuntimePath())
getLibraryPaths().push_back(*RuntimePath);
std::string CandidateLibPath = getArchSpecificLibPath();
if (getVFS().exists(CandidateLibPath))
getFilePaths().push_back(CandidateLibPath);
}
void ToolChain::setTripleEnvironment(llvm::Triple::EnvironmentType Env) {
Triple.setEnvironment(Env);
if (EffectiveTriple != llvm::Triple())
EffectiveTriple.setEnvironment(Env);
}
ToolChain::~ToolChain() = default;
llvm::vfs::FileSystem &ToolChain::getVFS() const {
return getDriver().getVFS();
}
bool ToolChain::useIntegratedAs() const {
return Args.hasFlag(options::OPT_fintegrated_as,
options::OPT_fno_integrated_as,
IsIntegratedAssemblerDefault());
}
bool ToolChain::useRelaxRelocations() const {
return ENABLE_X86_RELAX_RELOCATIONS;
}
bool ToolChain::isNoExecStackDefault() const {
return false;
}
const SanitizerArgs& ToolChain::getSanitizerArgs() const {
if (!SanitizerArguments.get())
SanitizerArguments.reset(new SanitizerArgs(*this, Args));
return *SanitizerArguments.get();
}
const XRayArgs& ToolChain::getXRayArgs() const {
if (!XRayArguments.get())
XRayArguments.reset(new XRayArgs(*this, Args));
return *XRayArguments.get();
}
namespace {
struct DriverSuffix {
const char *Suffix;
const char *ModeFlag;
};
} // namespace
static const DriverSuffix *FindDriverSuffix(StringRef ProgName, size_t &Pos) {
// A list of known driver suffixes. Suffixes are compared against the
// program name in order. If there is a match, the frontend type is updated as
// necessary by applying the ModeFlag.
static const DriverSuffix DriverSuffixes[] = {
{"clang", nullptr},
{"clang++", "--driver-mode=g++"},
{"clang-c++", "--driver-mode=g++"},
{"clang-cc", nullptr},
{"clang-cpp", "--driver-mode=cpp"},
{"clang-g++", "--driver-mode=g++"},
{"clang-gcc", nullptr},
{"clang-cl", "--driver-mode=cl"},
{"cc", nullptr},
{"cpp", "--driver-mode=cpp"},
{"cl", "--driver-mode=cl"},
{"++", "--driver-mode=g++"},
{"flang", "--driver-mode=flang"},
};
for (size_t i = 0; i < llvm::array_lengthof(DriverSuffixes); ++i) {
StringRef Suffix(DriverSuffixes[i].Suffix);
if (ProgName.endswith(Suffix)) {
Pos = ProgName.size() - Suffix.size();
return &DriverSuffixes[i];
}
}
return nullptr;
}
/// Normalize the program name from argv[0] by stripping the file extension if
/// present and lower-casing the string on Windows.
static std::string normalizeProgramName(llvm::StringRef Argv0) {
std::string ProgName = llvm::sys::path::stem(Argv0);
#ifdef _WIN32
// Transform to lowercase for case insensitive file systems.
std::transform(ProgName.begin(), ProgName.end(), ProgName.begin(), ::tolower);
#endif
return ProgName;
}
static const DriverSuffix *parseDriverSuffix(StringRef ProgName, size_t &Pos) {
// Try to infer frontend type and default target from the program name by
// comparing it against DriverSuffixes in order.
// If there is a match, the function tries to identify a target as prefix.
// E.g. "x86_64-linux-clang" as interpreted as suffix "clang" with target
// prefix "x86_64-linux". If such a target prefix is found, it may be
// added via -target as implicit first argument.
const DriverSuffix *DS = FindDriverSuffix(ProgName, Pos);
if (!DS) {
// Try again after stripping any trailing version number:
// clang++3.5 -> clang++
ProgName = ProgName.rtrim("0123456789.");
DS = FindDriverSuffix(ProgName, Pos);
}
if (!DS) {
// Try again after stripping trailing -component.
// clang++-tot -> clang++
ProgName = ProgName.slice(0, ProgName.rfind('-'));
DS = FindDriverSuffix(ProgName, Pos);
}
return DS;
}
ParsedClangName
ToolChain::getTargetAndModeFromProgramName(StringRef PN) {
std::string ProgName = normalizeProgramName(PN);
size_t SuffixPos;
const DriverSuffix *DS = parseDriverSuffix(ProgName, SuffixPos);
if (!DS)
return {};
size_t SuffixEnd = SuffixPos + strlen(DS->Suffix);
size_t LastComponent = ProgName.rfind('-', SuffixPos);
if (LastComponent == std::string::npos)
return ParsedClangName(ProgName.substr(0, SuffixEnd), DS->ModeFlag);
std::string ModeSuffix = ProgName.substr(LastComponent + 1,
SuffixEnd - LastComponent - 1);
// Infer target from the prefix.
StringRef Prefix(ProgName);
Prefix = Prefix.slice(0, LastComponent);
std::string IgnoredError;
bool IsRegistered = llvm::TargetRegistry::lookupTarget(Prefix, IgnoredError);
return ParsedClangName{Prefix, ModeSuffix, DS->ModeFlag, IsRegistered};
}
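// Illustrative parse: "x86_64-linux-clang++" splits into target prefix
// "x86_64-linux", mode suffix "clang++", and mode flag
// "--driver-mode=g++"; IsRegistered records whether the prefix names a
// target known to the TargetRegistry.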
StringRef ToolChain::getDefaultUniversalArchName() const {
// In universal driver terms, the arch name accepted by -arch isn't exactly
// the same as the ones that appear in the triple. Roughly speaking, this is
// an inverse of the darwin::getArchTypeForDarwinArchName() function, but the
// only interesting special case is powerpc.
switch (Triple.getArch()) {
case llvm::Triple::ppc:
return "ppc";
case llvm::Triple::ppc64:
return "ppc64";
case llvm::Triple::ppc64le:
return "ppc64le";
default:
return Triple.getArchName();
}
}
std::string ToolChain::getInputFilename(const InputInfo &Input) const {
return Input.getFilename();
}
bool ToolChain::IsUnwindTablesDefault(const ArgList &Args) const {
return false;
}
Tool *ToolChain::getClang() const {
if (!Clang)
Clang.reset(new tools::Clang(*this));
return Clang.get();
}
Tool *ToolChain::getFlang() const {
if (!Flang)
Flang.reset(new tools::Flang(*this));
return Flang.get();
}
Tool *ToolChain::buildAssembler() const {
return new tools::ClangAs(*this);
}
Tool *ToolChain::buildLinker() const {
llvm_unreachable("Linking is not supported by this toolchain");
}
Tool *ToolChain::getAssemble() const {
if (!Assemble)
Assemble.reset(buildAssembler());
return Assemble.get();
}
Tool *ToolChain::getClangAs() const {
if (!Assemble)
Assemble.reset(new tools::ClangAs(*this));
return Assemble.get();
}
Tool *ToolChain::getLink() const {
if (!Link)
Link.reset(buildLinker());
return Link.get();
}
Tool *ToolChain::getIfsMerge() const {
if (!IfsMerge)
IfsMerge.reset(new tools::ifstool::Merger(*this));
return IfsMerge.get();
}
Tool *ToolChain::getOffloadBundler() const {
if (!OffloadBundler)
OffloadBundler.reset(new tools::OffloadBundler(*this));
return OffloadBundler.get();
}
Tool *ToolChain::getOffloadWrapper() const {
if (!OffloadWrapper)
OffloadWrapper.reset(new tools::OffloadWrapper(*this));
return OffloadWrapper.get();
}
Tool *ToolChain::getTool(Action::ActionClass AC) const {
switch (AC) {
case Action::AssembleJobClass:
return getAssemble();
case Action::IfsMergeJobClass:
return getIfsMerge();
case Action::LinkJobClass:
return getLink();
case Action::InputClass:
case Action::BindArchClass:
case Action::OffloadClass:
case Action::LipoJobClass:
case Action::DsymutilJobClass:
case Action::VerifyDebugInfoJobClass:
llvm_unreachable("Invalid tool kind.");
case Action::CompileJobClass:
case Action::PrecompileJobClass:
case Action::HeaderModulePrecompileJobClass:
case Action::PreprocessJobClass:
case Action::AnalyzeJobClass:
case Action::MigrateJobClass:
case Action::VerifyPCHJobClass:
case Action::BackendJobClass:
return getClang();
case Action::OffloadBundlingJobClass:
case Action::OffloadUnbundlingJobClass:
return getOffloadBundler();
case Action::OffloadWrapperJobClass:
return getOffloadWrapper();
}
llvm_unreachable("Invalid tool kind.");
}
static StringRef getArchNameForCompilerRTLib(const ToolChain &TC,
const ArgList &Args) {
const llvm::Triple &Triple = TC.getTriple();
bool IsWindows = Triple.isOSWindows();
if (TC.getArch() == llvm::Triple::arm || TC.getArch() == llvm::Triple::armeb)
return (arm::getARMFloatABI(TC, Args) == arm::FloatABI::Hard && !IsWindows)
? "armhf"
: "arm";
// For historic reasons, the Android library uses i686 instead of i386.
if (TC.getArch() == llvm::Triple::x86 && Triple.isAndroid())
return "i686";
return llvm::Triple::getArchTypeName(TC.getArch());
}
StringRef ToolChain::getOSLibName() const {
switch (Triple.getOS()) {
case llvm::Triple::FreeBSD:
return "freebsd";
case llvm::Triple::NetBSD:
return "netbsd";
case llvm::Triple::OpenBSD:
return "openbsd";
case llvm::Triple::Solaris:
return "sunos";
default:
return getOS();
}
}
std::string ToolChain::getCompilerRTPath() const {
SmallString<128> Path(getDriver().ResourceDir);
if (Triple.isOSUnknown()) {
llvm::sys::path::append(Path, "lib");
} else {
llvm::sys::path::append(Path, "lib", getOSLibName());
}
return Path.str();
}
std::string ToolChain::getCompilerRT(const ArgList &Args, StringRef Component,
FileType Type) const {
const llvm::Triple &TT = getTriple();
bool IsITANMSVCWindows =
TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment();
const char *Prefix =
IsITANMSVCWindows || Type == ToolChain::FT_Object ? "" : "lib";
const char *Suffix;
switch (Type) {
case ToolChain::FT_Object:
Suffix = IsITANMSVCWindows ? ".obj" : ".o";
break;
case ToolChain::FT_Static:
Suffix = IsITANMSVCWindows ? ".lib" : ".a";
break;
case ToolChain::FT_Shared:
Suffix = Triple.isOSWindows()
? (Triple.isWindowsGNUEnvironment() ? ".dll.a" : ".lib")
: ".so";
break;
}
for (const auto &LibPath : getLibraryPaths()) {
SmallString<128> P(LibPath);
llvm::sys::path::append(P, Prefix + Twine("clang_rt.") + Component + Suffix);
if (getVFS().exists(P))
return P.str();
}
StringRef Arch = getArchNameForCompilerRTLib(*this, Args);
const char *Env = TT.isAndroid() ? "-android" : "";
SmallString<128> Path(getCompilerRTPath());
llvm::sys::path::append(Path, Prefix + Twine("clang_rt.") + Component + "-" +
Arch + Env + Suffix);
return Path.str();
}
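// Illustrative note (not part of the original source): on a Linux x86_64
// target, getCompilerRT(Args, "profile", FT_Static) typically resolves to
//   <ResourceDir>/lib/linux/libclang_rt.profile-x86_64.a
// (Prefix "lib", Arch "x86_64", Env "", Suffix ".a"), whereas MSVC targets
// drop the "lib" prefix and use the ".lib" suffix instead.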
const char *ToolChain::getCompilerRTArgString(const llvm::opt::ArgList &Args,
StringRef Component,
FileType Type) const {
return Args.MakeArgString(getCompilerRT(Args, Component, Type));
}
Optional<std::string> ToolChain::getRuntimePath() const {
SmallString<128> P;
// First try the triple passed to driver as --target=<triple>.
P.assign(D.ResourceDir);
llvm::sys::path::append(P, "lib", D.getTargetTriple());
if (getVFS().exists(P))
return llvm::Optional<std::string>(P.str());
// Second try the normalized triple.
P.assign(D.ResourceDir);
llvm::sys::path::append(P, "lib", Triple.str());
if (getVFS().exists(P))
return llvm::Optional<std::string>(P.str());
return None;
}
Optional<std::string> ToolChain::getCXXStdlibPath() const {
SmallString<128> P;
// First try the triple passed to driver as --target=<triple>.
P.assign(D.Dir);
llvm::sys::path::append(P, "..", "lib", D.getTargetTriple(), "c++");
if (getVFS().exists(P))
return llvm::Optional<std::string>(P.str());
// Second try the normalized triple.
P.assign(D.Dir);
llvm::sys::path::append(P, "..", "lib", Triple.str(), "c++");
if (getVFS().exists(P))
return llvm::Optional<std::string>(P.str());
return None;
}
std::string ToolChain::getArchSpecificLibPath() const {
SmallString<128> Path(getDriver().ResourceDir);
llvm::sys::path::append(Path, "lib", getOSLibName(),
llvm::Triple::getArchTypeName(getArch()));
return Path.str();
}
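// Illustrative note (not part of the original source): on FreeBSD/amd64 this
// resolves to <ResourceDir>/lib/freebsd/x86_64, i.e. getOSLibName() joined
// with the LLVM architecture name.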
bool ToolChain::needsProfileRT(const ArgList &Args) {
if (Args.hasArg(options::OPT_noprofilelib))
return false;
if (needsGCovInstrumentation(Args) ||
Args.hasArg(options::OPT_fprofile_generate) ||
Args.hasArg(options::OPT_fprofile_generate_EQ) ||
Args.hasArg(options::OPT_fcs_profile_generate) ||
Args.hasArg(options::OPT_fcs_profile_generate_EQ) ||
Args.hasArg(options::OPT_fprofile_instr_generate) ||
Args.hasArg(options::OPT_fprofile_instr_generate_EQ) ||
Args.hasArg(options::OPT_fcreate_profile) ||
Args.hasArg(options::OPT_forder_file_instrumentation))
return true;
return false;
}
bool ToolChain::needsGCovInstrumentation(const llvm::opt::ArgList &Args) {
return Args.hasFlag(options::OPT_fprofile_arcs, options::OPT_fno_profile_arcs,
false) ||
Args.hasArg(options::OPT_coverage);
}
Tool *ToolChain::SelectTool(const JobAction &JA) const {
if (D.IsFlangMode() && getDriver().ShouldUseFlangCompiler(JA)) return getFlang();
if (getDriver().ShouldUseClangCompiler(JA)) return getClang();
Action::ActionClass AC = JA.getKind();
if (AC == Action::AssembleJobClass && useIntegratedAs())
return getClangAs();
return getTool(AC);
}
std::string ToolChain::GetFilePath(const char *Name) const {
return D.GetFilePath(Name, *this);
}
std::string ToolChain::GetProgramPath(const char *Name) const {
return D.GetProgramPath(Name, *this);
}
std::string ToolChain::GetLinkerPath() const {
const Arg* A = Args.getLastArg(options::OPT_fuse_ld_EQ);
StringRef UseLinker = A ? A->getValue() : CLANG_DEFAULT_LINKER;
if (llvm::sys::path::is_absolute(UseLinker)) {
// If we're passed what looks like an absolute path, don't attempt to
// second-guess that.
if (llvm::sys::fs::can_execute(UseLinker))
return UseLinker;
} else if (UseLinker.empty() || UseLinker == "ld") {
// If we're passed -fuse-ld= with no argument, or with the argument ld,
// then use whatever the default system linker is.
return GetProgramPath(getDefaultLinker());
} else {
llvm::SmallString<8> LinkerName;
if (Triple.isOSDarwin())
LinkerName.append("ld64.");
else
LinkerName.append("ld.");
LinkerName.append(UseLinker);
std::string LinkerPath(GetProgramPath(LinkerName.c_str()));
if (llvm::sys::fs::can_execute(LinkerPath))
return LinkerPath;
}
if (A)
getDriver().Diag(diag::err_drv_invalid_linker_name) << A->getAsString(Args);
return GetProgramPath(getDefaultLinker());
}
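// Illustrative note (not part of the original source): -fuse-ld=lld is
// resolved by prefixing the linker flavor, i.e. GetProgramPath("ld.lld"),
// or GetProgramPath("ld64.lld") on Darwin; an absolute path such as
// -fuse-ld=/usr/local/bin/ld.lld is used verbatim if it is executable.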
types::ID ToolChain::LookupTypeForExtension(StringRef Ext) const {
types::ID id = types::lookupTypeForExtension(Ext);
// Flang always runs the preprocessor and has no notion of "preprocessed
// fortran". Here, TY_PP_Fortran is coerced to TY_Fortran to avoid treating
// them differently.
if (D.IsFlangMode() && id == types::TY_PP_Fortran)
id = types::TY_Fortran;
return id;
}
bool ToolChain::HasNativeLLVMSupport() const {
return false;
}
bool ToolChain::isCrossCompiling() const {
llvm::Triple HostTriple(LLVM_HOST_TRIPLE);
switch (HostTriple.getArch()) {
// The A32/T32/T16 instruction sets are not separate architectures in this
// context.
case llvm::Triple::arm:
case llvm::Triple::armeb:
case llvm::Triple::thumb:
case llvm::Triple::thumbeb:
return getArch() != llvm::Triple::arm && getArch() != llvm::Triple::thumb &&
getArch() != llvm::Triple::armeb && getArch() != llvm::Triple::thumbeb;
default:
return HostTriple.getArch() != getArch();
}
}
ObjCRuntime ToolChain::getDefaultObjCRuntime(bool isNonFragile) const {
return ObjCRuntime(isNonFragile ? ObjCRuntime::GNUstep : ObjCRuntime::GCC,
VersionTuple());
}
llvm::ExceptionHandling
ToolChain::GetExceptionModel(const llvm::opt::ArgList &Args) const {
return llvm::ExceptionHandling::None;
}
bool ToolChain::isThreadModelSupported(const StringRef Model) const {
if (Model == "single") {
// FIXME: 'single' is only supported on ARM and WebAssembly so far.
return Triple.getArch() == llvm::Triple::arm ||
Triple.getArch() == llvm::Triple::armeb ||
Triple.getArch() == llvm::Triple::thumb ||
Triple.getArch() == llvm::Triple::thumbeb ||
Triple.getArch() == llvm::Triple::wasm32 ||
Triple.getArch() == llvm::Triple::wasm64;
} else if (Model == "posix")
return true;
return false;
}
std::string ToolChain::ComputeLLVMTriple(const ArgList &Args,
types::ID InputType) const {
switch (getTriple().getArch()) {
default:
return getTripleString();
case llvm::Triple::x86_64: {
llvm::Triple Triple = getTriple();
if (!Triple.isOSBinFormatMachO())
return getTripleString();
if (Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
// x86_64h goes in the triple. Other -march options just use the
// vanilla triple we already have.
StringRef MArch = A->getValue();
if (MArch == "x86_64h")
Triple.setArchName(MArch);
}
return Triple.getTriple();
}
case llvm::Triple::aarch64: {
llvm::Triple Triple = getTriple();
if (!Triple.isOSBinFormatMachO())
return getTripleString();
// FIXME: older versions of ld64 expect the "arm64" component in the actual
// triple string and query it to determine whether an LTO file can be
// handled. Remove this when we don't care any more.
Triple.setArchName("arm64");
return Triple.getTriple();
}
case llvm::Triple::aarch64_32:
return getTripleString();
case llvm::Triple::arm:
case llvm::Triple::armeb:
case llvm::Triple::thumb:
case llvm::Triple::thumbeb: {
// FIXME: Factor into subclasses.
llvm::Triple Triple = getTriple();
bool IsBigEndian = getTriple().getArch() == llvm::Triple::armeb ||
getTriple().getArch() == llvm::Triple::thumbeb;
// Handle pseudo-target flags '-mlittle-endian'/'-EL' and
// '-mbig-endian'/'-EB'.
if (Arg *A = Args.getLastArg(options::OPT_mlittle_endian,
options::OPT_mbig_endian)) {
IsBigEndian = !A->getOption().matches(options::OPT_mlittle_endian);
}
// Thumb2 is the default for V7 on Darwin.
//
// FIXME: Thumb should just be another -target-feature, not in the triple.
StringRef MCPU, MArch;
if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
MCPU = A->getValue();
if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
MArch = A->getValue();
std::string CPU =
Triple.isOSBinFormatMachO()
? tools::arm::getARMCPUForMArch(MArch, Triple).str()
: tools::arm::getARMTargetCPU(MCPU, MArch, Triple);
StringRef Suffix =
tools::arm::getLLVMArchSuffixForARM(CPU, MArch, Triple);
bool IsMProfile = ARM::parseArchProfile(Suffix) == ARM::ProfileKind::M;
bool ThumbDefault = IsMProfile || (ARM::parseArchVersion(Suffix) == 7 &&
getTriple().isOSBinFormatMachO());
// FIXME: this is invalid for WindowsCE
if (getTriple().isOSWindows())
ThumbDefault = true;
std::string ArchName;
if (IsBigEndian)
ArchName = "armeb";
else
ArchName = "arm";
// Check if ARM ISA was explicitly selected (using -mno-thumb or -marm) for
// M-Class CPUs/architecture variants, which is not supported.
bool ARMModeRequested = !Args.hasFlag(options::OPT_mthumb,
options::OPT_mno_thumb, ThumbDefault);
if (IsMProfile && ARMModeRequested) {
if (!MCPU.empty())
getDriver().Diag(diag::err_cpu_unsupported_isa) << CPU << "ARM";
else
getDriver().Diag(diag::err_arch_unsupported_isa)
<< tools::arm::getARMArch(MArch, getTriple()) << "ARM";
}
// Check to see if an explicit choice to use thumb has been made via
// -mthumb. For assembler files we must check for -mthumb in the options
// passed to the assembler via -Wa or -Xassembler.
bool IsThumb = false;
if (InputType != types::TY_PP_Asm)
IsThumb = Args.hasFlag(options::OPT_mthumb, options::OPT_mno_thumb,
ThumbDefault);
else {
// Ideally we would check for these flags in
// CollectArgsForIntegratedAssembler but we can't change the ArchName at
// that point. There is no assembler equivalent of -mno-thumb, -marm, or
// -mno-arm.
for (const auto *A :
Args.filtered(options::OPT_Wa_COMMA, options::OPT_Xassembler)) {
for (StringRef Value : A->getValues()) {
if (Value == "-mthumb")
IsThumb = true;
}
}
}
// Assembly files should start in ARM mode, unless arch is M-profile, or
// -mthumb has been passed explicitly to the assembler. Windows is always
// thumb.
if (IsThumb || IsMProfile || getTriple().isOSWindows()) {
if (IsBigEndian)
ArchName = "thumbeb";
else
ArchName = "thumb";
}
Triple.setArchName(ArchName + Suffix.str());
return Triple.getTriple();
}
}
}
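// Illustrative note (not part of the original source): for an armv7 Darwin
// target with no -marm/-mthumb flags, ThumbDefault is true (v7 on Mach-O),
// so the arch component of the computed triple becomes "thumbv7"; adding
// -mbig-endian on a non-Darwin armv7 target instead yields "armebv7" unless
// Thumb is also selected.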
std::string ToolChain::ComputeEffectiveClangTriple(const ArgList &Args,
types::ID InputType) const {
return ComputeLLVMTriple(Args, InputType);
}
void ToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
ArgStringList &CC1Args) const {
// Each toolchain should provide the appropriate include flags.
}
void ToolChain::addClangTargetOptions(
const ArgList &DriverArgs, ArgStringList &CC1Args,
Action::OffloadKind DeviceOffloadKind) const {}
void ToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {}
void ToolChain::addProfileRTLibs(const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) const {
if (!needsProfileRT(Args)) return;
CmdArgs.push_back(getCompilerRTArgString(Args, "profile"));
}
ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType(
const ArgList &Args) const {
const Arg* A = Args.getLastArg(options::OPT_rtlib_EQ);
StringRef LibName = A ? A->getValue() : CLANG_DEFAULT_RTLIB;
// Only use "platform" in tests to override CLANG_DEFAULT_RTLIB!
if (LibName == "compiler-rt")
return ToolChain::RLT_CompilerRT;
else if (LibName == "libgcc")
return ToolChain::RLT_Libgcc;
else if (LibName == "platform")
return GetDefaultRuntimeLibType();
if (A)
getDriver().Diag(diag::err_drv_invalid_rtlib_name) << A->getAsString(Args);
return GetDefaultRuntimeLibType();
}
ToolChain::UnwindLibType ToolChain::GetUnwindLibType(
const ArgList &Args) const {
const Arg *A = Args.getLastArg(options::OPT_unwindlib_EQ);
StringRef LibName = A ? A->getValue() : CLANG_DEFAULT_UNWINDLIB;
if (LibName == "none")
return ToolChain::UNW_None;
else if (LibName == "platform" || LibName == "") {
ToolChain::RuntimeLibType RtLibType = GetRuntimeLibType(Args);
if (RtLibType == ToolChain::RLT_CompilerRT)
return ToolChain::UNW_None;
else if (RtLibType == ToolChain::RLT_Libgcc)
return ToolChain::UNW_Libgcc;
} else if (LibName == "libunwind") {
if (GetRuntimeLibType(Args) == RLT_Libgcc)
getDriver().Diag(diag::err_drv_incompatible_unwindlib);
return ToolChain::UNW_CompilerRT;
} else if (LibName == "libgcc")
return ToolChain::UNW_Libgcc;
if (A)
getDriver().Diag(diag::err_drv_invalid_unwindlib_name)
<< A->getAsString(Args);
return GetDefaultUnwindLibType();
}
ToolChain::CXXStdlibType ToolChain::GetCXXStdlibType(const ArgList &Args) const{
const Arg *A = Args.getLastArg(options::OPT_stdlib_EQ);
StringRef LibName = A ? A->getValue() : CLANG_DEFAULT_CXX_STDLIB;
// Only use "platform" in tests to override CLANG_DEFAULT_CXX_STDLIB!
if (LibName == "libc++")
return ToolChain::CST_Libcxx;
else if (LibName == "libstdc++")
return ToolChain::CST_Libstdcxx;
else if (LibName == "platform")
return GetDefaultCXXStdlibType();
if (A)
getDriver().Diag(diag::err_drv_invalid_stdlib_name) << A->getAsString(Args);
return GetDefaultCXXStdlibType();
}
/// Utility function to add a system include directory to CC1 arguments.
/*static*/ void ToolChain::addSystemInclude(const ArgList &DriverArgs,
ArgStringList &CC1Args,
const Twine &Path) {
CC1Args.push_back("-internal-isystem");
CC1Args.push_back(DriverArgs.MakeArgString(Path));
}
/// Utility function to add a system include directory with extern "C"
/// semantics to CC1 arguments.
///
/// Note that this should be used rarely, and only for directories that
/// historically and for legacy reasons are treated as having implicit extern
/// "C" semantics. These semantics are by and large *ignored* today, but it's
/// important to preserve the preprocessor changes resulting from the
/// classification.
/*static*/ void ToolChain::addExternCSystemInclude(const ArgList &DriverArgs,
ArgStringList &CC1Args,
const Twine &Path) {
CC1Args.push_back("-internal-externc-isystem");
CC1Args.push_back(DriverArgs.MakeArgString(Path));
}
void ToolChain::addExternCSystemIncludeIfExists(const ArgList &DriverArgs,
ArgStringList &CC1Args,
const Twine &Path) {
if (llvm::sys::fs::exists(Path))
addExternCSystemInclude(DriverArgs, CC1Args, Path);
}
/// Utility function to add a list of system include directories to CC1.
/*static*/ void ToolChain::addSystemIncludes(const ArgList &DriverArgs,
ArgStringList &CC1Args,
ArrayRef<StringRef> Paths) {
for (const auto &Path : Paths) {
CC1Args.push_back("-internal-isystem");
CC1Args.push_back(DriverArgs.MakeArgString(Path));
}
}
void ToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
ArgStringList &CC1Args) const {
// Header search paths should be handled by each of the subclasses.
// Historically, they have not been, and instead have been handled inside of
// the CC1-layer frontend. As the logic is hoisted out, this generic function
// will slowly stop being called.
//
// While it is being called, replicate a bit of a hack to propagate the
// '-stdlib=' flag down to CC1 so that it can in turn customize the C++
// header search paths with it. Once all systems are overriding this
// function, the CC1 flag and this line can be removed.
DriverArgs.AddAllArgs(CC1Args, options::OPT_stdlib_EQ);
}
void ToolChain::AddClangCXXStdlibIsystemArgs(
const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args) const {
DriverArgs.ClaimAllArgs(options::OPT_stdlibxx_isystem);
if (!DriverArgs.hasArg(options::OPT_nostdincxx))
for (const auto &P :
DriverArgs.getAllArgValues(options::OPT_stdlibxx_isystem))
addSystemInclude(DriverArgs, CC1Args, P);
}
bool ToolChain::ShouldLinkCXXStdlib(const llvm::opt::ArgList &Args) const {
return getDriver().CCCIsCXX() &&
!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs,
options::OPT_nostdlibxx);
}
void ToolChain::AddCXXStdlibLibArgs(const ArgList &Args,
ArgStringList &CmdArgs) const {
assert(!Args.hasArg(options::OPT_nostdlibxx) &&
"should not have called this");
CXXStdlibType Type = GetCXXStdlibType(Args);
switch (Type) {
case ToolChain::CST_Libcxx:
CmdArgs.push_back("-lc++");
break;
case ToolChain::CST_Libstdcxx:
CmdArgs.push_back("-lstdc++");
break;
}
}
void ToolChain::AddFilePathLibArgs(const ArgList &Args,
ArgStringList &CmdArgs) const {
for (const auto &LibPath : getFilePaths())
if(LibPath.length() > 0)
CmdArgs.push_back(Args.MakeArgString(StringRef("-L") + LibPath));
}
void ToolChain::AddCCKextLibArgs(const ArgList &Args,
ArgStringList &CmdArgs) const {
CmdArgs.push_back("-lcc_kext");
}
bool ToolChain::AddFastMathRuntimeIfAvailable(const ArgList &Args,
ArgStringList &CmdArgs) const {
// Do not check for -fno-fast-math or -fno-unsafe-math when -Ofast is passed
// (to keep the linker options consistent with gcc and clang itself).
if (!isOptimizationLevelFast(Args)) {
// Check whether -ffast-math or -funsafe-math-optimizations is in effect.
Arg *A =
Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math,
options::OPT_funsafe_math_optimizations,
options::OPT_fno_unsafe_math_optimizations);
if (!A || A->getOption().getID() == options::OPT_fno_fast_math ||
A->getOption().getID() == options::OPT_fno_unsafe_math_optimizations)
return false;
}
// If crtfastmath.o exists add it to the arguments.
std::string Path = GetFilePath("crtfastmath.o");
if (Path == "crtfastmath.o") // Not found.
return false;
CmdArgs.push_back(Args.MakeArgString(Path));
return true;
}
SanitizerMask ToolChain::getSupportedSanitizers() const {
// Return sanitizers which don't require runtime support and are not
// platform dependent.
SanitizerMask Res = (SanitizerKind::Undefined & ~SanitizerKind::Vptr &
~SanitizerKind::Function) |
(SanitizerKind::CFI & ~SanitizerKind::CFIICall) |
SanitizerKind::CFICastStrict |
SanitizerKind::FloatDivideByZero |
SanitizerKind::UnsignedIntegerOverflow |
SanitizerKind::ImplicitConversion |
SanitizerKind::Nullability | SanitizerKind::LocalBounds;
if (getTriple().getArch() == llvm::Triple::x86 ||
getTriple().getArch() == llvm::Triple::x86_64 ||
getTriple().getArch() == llvm::Triple::arm ||
- getTriple().getArch() == llvm::Triple::aarch64 ||
getTriple().getArch() == llvm::Triple::wasm32 ||
- getTriple().getArch() == llvm::Triple::wasm64)
+ getTriple().getArch() == llvm::Triple::wasm64 || getTriple().isAArch64())
Res |= SanitizerKind::CFIICall;
- if (getTriple().getArch() == llvm::Triple::x86_64 ||
- getTriple().getArch() == llvm::Triple::aarch64)
+ if (getTriple().getArch() == llvm::Triple::x86_64 || getTriple().isAArch64())
Res |= SanitizerKind::ShadowCallStack;
- if (getTriple().getArch() == llvm::Triple::aarch64 ||
- getTriple().getArch() == llvm::Triple::aarch64_be)
+ if (getTriple().isAArch64())
Res |= SanitizerKind::MemTag;
return Res;
}
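// Illustrative note (not part of the original source): the isAArch64() calls
// introduced above match the AArch64 family (little- and big-endian
// variants), which is what lets the previous per-arch comparisons collapse
// into a single predicate per sanitizer group.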
void ToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
ArgStringList &CC1Args) const {}
void ToolChain::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
ArgStringList &CC1Args) const {}
static VersionTuple separateMSVCFullVersion(unsigned Version) {
if (Version < 100)
return VersionTuple(Version);
if (Version < 10000)
return VersionTuple(Version / 100, Version % 100);
unsigned Build = 0, Factor = 1;
for (; Version > 10000; Version = Version / 10, Factor = Factor * 10)
Build = Build + (Version % 10) * Factor;
return VersionTuple(Version / 100, Version % 100, Build);
}
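// Illustrative note (not part of the original source): the digit-peeling loop
// above splits a full _MSC_FULL_VER-style value into major.minor.build, e.g.
//   separateMSVCFullVersion(190024210) == VersionTuple(19, 0, 24210)
//   separateMSVCFullVersion(1900)      == VersionTuple(19, 0)
//   separateMSVCFullVersion(19)        == VersionTuple(19)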
VersionTuple
ToolChain::computeMSVCVersion(const Driver *D,
const llvm::opt::ArgList &Args) const {
const Arg *MSCVersion = Args.getLastArg(options::OPT_fmsc_version);
const Arg *MSCompatibilityVersion =
Args.getLastArg(options::OPT_fms_compatibility_version);
if (MSCVersion && MSCompatibilityVersion) {
if (D)
D->Diag(diag::err_drv_argument_not_allowed_with)
<< MSCVersion->getAsString(Args)
<< MSCompatibilityVersion->getAsString(Args);
return VersionTuple();
}
if (MSCompatibilityVersion) {
VersionTuple MSVT;
if (MSVT.tryParse(MSCompatibilityVersion->getValue())) {
if (D)
D->Diag(diag::err_drv_invalid_value)
<< MSCompatibilityVersion->getAsString(Args)
<< MSCompatibilityVersion->getValue();
} else {
return MSVT;
}
}
if (MSCVersion) {
unsigned Version = 0;
if (StringRef(MSCVersion->getValue()).getAsInteger(10, Version)) {
if (D)
D->Diag(diag::err_drv_invalid_value)
<< MSCVersion->getAsString(Args) << MSCVersion->getValue();
} else {
return separateMSVCFullVersion(Version);
}
}
return VersionTuple();
}
llvm::opt::DerivedArgList *ToolChain::TranslateOpenMPTargetArgs(
const llvm::opt::DerivedArgList &Args, bool SameTripleAsHost,
SmallVectorImpl<llvm::opt::Arg *> &AllocatedArgs) const {
DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs());
const OptTable &Opts = getDriver().getOpts();
bool Modified = false;
// Handle -Xopenmp-target flags
for (auto *A : Args) {
// Exclude flags which may only apply to the host toolchain.
// Do not exclude flags when the host triple (AuxTriple)
// matches the current toolchain triple. If it is not present
// at all, target and host share a toolchain.
if (A->getOption().matches(options::OPT_m_Group)) {
if (SameTripleAsHost)
DAL->append(A);
else
Modified = true;
continue;
}
unsigned Index;
unsigned Prev;
bool XOpenMPTargetNoTriple =
A->getOption().matches(options::OPT_Xopenmp_target);
if (A->getOption().matches(options::OPT_Xopenmp_target_EQ)) {
// Passing device args: -Xopenmp-target=<triple> -opt=val.
if (A->getValue(0) == getTripleString())
Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
else
continue;
} else if (XOpenMPTargetNoTriple) {
// Passing device args: -Xopenmp-target -opt=val.
Index = Args.getBaseArgs().MakeIndex(A->getValue(0));
} else {
DAL->append(A);
continue;
}
// Parse the argument to -Xopenmp-target.
Prev = Index;
std::unique_ptr<Arg> XOpenMPTargetArg(Opts.ParseOneArg(Args, Index));
if (!XOpenMPTargetArg || Index > Prev + 1) {
getDriver().Diag(diag::err_drv_invalid_Xopenmp_target_with_args)
<< A->getAsString(Args);
continue;
}
if (XOpenMPTargetNoTriple && XOpenMPTargetArg &&
Args.getAllArgValues(options::OPT_fopenmp_targets_EQ).size() != 1) {
getDriver().Diag(diag::err_drv_Xopenmp_target_missing_triple);
continue;
}
XOpenMPTargetArg->setBaseArg(A);
A = XOpenMPTargetArg.release();
AllocatedArgs.push_back(A);
DAL->append(A);
Modified = true;
}
if (Modified)
return DAL;
delete DAL;
return nullptr;
}
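// Illustrative note (not part of the original source): for a device triple
// nvptx64-nvidia-cuda, a command line such as
//   -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_60
// re-parses "-march=sm_60" through Opts.ParseOneArg and appends it to the
// derived list, while host-only -m group flags are dropped whenever the
// device triple differs from the host triple.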
Index: head/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/X86.cpp
===================================================================
--- head/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/X86.cpp (revision 362608)
+++ head/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/X86.cpp (revision 362609)
@@ -1,175 +1,198 @@
//===--- X86.cpp - X86 Helpers for Tools ------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "ToolChains/CommonArgs.h"
#include "clang/Driver/Driver.h"
#include "clang/Driver/DriverDiagnostic.h"
#include "clang/Driver/Options.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Support/Host.h"
using namespace clang::driver;
using namespace clang::driver::tools;
using namespace clang;
using namespace llvm::opt;
const char *x86::getX86TargetCPU(const ArgList &Args,
const llvm::Triple &Triple) {
if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
if (StringRef(A->getValue()) != "native")
return A->getValue();
// FIXME: Reject attempts to use -march=native unless the target matches
// the host.
//
// FIXME: We should also incorporate the detected target features for use
// with -march=native.
std::string CPU = llvm::sys::getHostCPUName();
if (!CPU.empty() && CPU != "generic")
return Args.MakeArgString(CPU);
}
if (const Arg *A = Args.getLastArgNoClaim(options::OPT__SLASH_arch)) {
// Mapping built by looking at lib/Basic's X86TargetInfo::initFeatureMap().
StringRef Arch = A->getValue();
const char *CPU = nullptr;
if (Triple.getArch() == llvm::Triple::x86) { // 32-bit-only /arch: flags.
CPU = llvm::StringSwitch<const char *>(Arch)
.Case("IA32", "i386")
.Case("SSE", "pentium3")
.Case("SSE2", "pentium4")
.Default(nullptr);
}
if (CPU == nullptr) { // 32-bit and 64-bit /arch: flags.
CPU = llvm::StringSwitch<const char *>(Arch)
.Case("AVX", "sandybridge")
.Case("AVX2", "haswell")
.Case("AVX512F", "knl")
.Case("AVX512", "skylake-avx512")
.Default(nullptr);
}
if (CPU) {
A->claim();
return CPU;
}
}
// Select the default CPU if none was given (or detection failed).
if (!Triple.isX86())
return nullptr; // This routine is only handling x86 targets.
bool Is64Bit = Triple.getArch() == llvm::Triple::x86_64;
// FIXME: Need target hooks.
if (Triple.isOSDarwin()) {
if (Triple.getArchName() == "x86_64h")
return "core-avx2";
// macosx10.12 drops support for all pre-Penryn Macs.
// Simulators can still run on 10.11 though, like Xcode.
if (Triple.isMacOSX() && !Triple.isOSVersionLT(10, 12))
return "penryn";
// The oldest x86_64 Macs have core2/Merom; the oldest x86 Macs have Yonah.
return Is64Bit ? "core2" : "yonah";
}
// Set up default CPU name for PS4 compilers.
if (Triple.isPS4CPU())
return "btver2";
// On Android, use targets compatible with gcc.
if (Triple.isAndroid())
return Is64Bit ? "x86-64" : "i686";
// Everything else goes to x86-64 in 64-bit mode.
if (Is64Bit)
return "x86-64";
switch (Triple.getOS()) {
case llvm::Triple::FreeBSD:
return "i686";
case llvm::Triple::NetBSD:
case llvm::Triple::OpenBSD:
return "i486";
case llvm::Triple::Haiku:
return "i586";
default:
// Fall back to pentium4.
return "pentium4";
}
}
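// Illustrative note (not part of the original source): the precedence above
// means an explicit -march=skylake wins outright, /arch:AVX2 maps to
// "haswell", and with no arch flags a plain x86_64-unknown-linux-gnu target
// falls through to the generic "x86-64" CPU.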
void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
const ArgList &Args,
std::vector<StringRef> &Features) {
// If -march=native, autodetect the feature list.
if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
if (StringRef(A->getValue()) == "native") {
llvm::StringMap<bool> HostFeatures;
if (llvm::sys::getHostCPUFeatures(HostFeatures))
for (auto &F : HostFeatures)
Features.push_back(
Args.MakeArgString((F.second ? "+" : "-") + F.first()));
}
}
if (Triple.getArchName() == "x86_64h") {
// x86_64h implies quite a few of the more modern subtarget features
// for Haswell-class CPUs, but not all of them. Opt out of a few.
Features.push_back("-rdrnd");
Features.push_back("-aes");
Features.push_back("-pclmul");
Features.push_back("-rtm");
Features.push_back("-fsgsbase");
}
const llvm::Triple::ArchType ArchType = Triple.getArch();
// Add features to be compatible with gcc for Android.
if (Triple.isAndroid()) {
if (ArchType == llvm::Triple::x86_64) {
Features.push_back("+sse4.2");
Features.push_back("+popcnt");
Features.push_back("+cx16");
} else
Features.push_back("+ssse3");
}
// Translate the high level `-mretpoline` flag to the specific target feature
// flags. We also detect if the user asked for retpoline external thunks but
// failed to ask for retpolines themselves (through any of the different
// flags). This is a bit hacky but keeps existing usages working. We should
// consider deprecating this and instead warn if the user requests external
// retpoline thunks and *doesn't* request some form of retpolines.
+ auto SpectreOpt = clang::driver::options::ID::OPT_INVALID;
if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline,
options::OPT_mspeculative_load_hardening,
options::OPT_mno_speculative_load_hardening)) {
if (Args.hasFlag(options::OPT_mretpoline, options::OPT_mno_retpoline,
false)) {
Features.push_back("+retpoline-indirect-calls");
Features.push_back("+retpoline-indirect-branches");
+ SpectreOpt = options::OPT_mretpoline;
} else if (Args.hasFlag(options::OPT_mspeculative_load_hardening,
options::OPT_mno_speculative_load_hardening,
false)) {
// On x86, speculative load hardening relies on at least using retpolines
// for indirect calls.
Features.push_back("+retpoline-indirect-calls");
+ SpectreOpt = options::OPT_mspeculative_load_hardening;
}
} else if (Args.hasFlag(options::OPT_mretpoline_external_thunk,
options::OPT_mno_retpoline_external_thunk, false)) {
// FIXME: Add a warning about failing to specify `-mretpoline` and
// eventually switch to an error here.
Features.push_back("+retpoline-indirect-calls");
Features.push_back("+retpoline-indirect-branches");
+ SpectreOpt = options::OPT_mretpoline_external_thunk;
+ }
+
+ auto LVIOpt = clang::driver::options::ID::OPT_INVALID;
+ if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening,
+ false)) {
+ Features.push_back("+lvi-load-hardening");
+ Features.push_back("+lvi-cfi"); // load hardening implies CFI protection
+ LVIOpt = options::OPT_mlvi_hardening;
+ } else if (Args.hasFlag(options::OPT_mlvi_cfi, options::OPT_mno_lvi_cfi,
+ false)) {
+ Features.push_back("+lvi-cfi");
+ LVIOpt = options::OPT_mlvi_cfi;
+ }
+
+ if (SpectreOpt != clang::driver::options::ID::OPT_INVALID &&
+ LVIOpt != clang::driver::options::ID::OPT_INVALID) {
+ D.Diag(diag::err_drv_argument_not_allowed_with)
+ << D.getOpts().getOptionName(SpectreOpt)
+ << D.getOpts().getOptionName(LVIOpt);
}
// Now add any that the user explicitly requested on the command line,
// which may override the defaults.
handleTargetFeaturesGroup(Args, Features, options::OPT_m_x86_Features_Group);
}
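// Illustrative note (not part of the original source): with the LVI handling
// added above, -mlvi-hardening alone enables both +lvi-load-hardening and
// +lvi-cfi, while combining it (or -mlvi-cfi) with -mretpoline or
// -mspeculative-load-hardening now emits err_drv_argument_not_allowed_with,
// since the two mitigation families select conflicting target features.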
Index: head/contrib/llvm-project/clang
===================================================================
--- head/contrib/llvm-project/clang (revision 362608)
+++ head/contrib/llvm-project/clang (revision 362609)
Property changes on: head/contrib/llvm-project/clang
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/llvm-project/release-10.x/clang:r362444-362594
Index: head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFGraph.h
===================================================================
--- head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFGraph.h (nonexistent)
+++ head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFGraph.h (revision 362609)
@@ -0,0 +1,968 @@
+//===- RDFGraph.h -----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target-independent, SSA-based data flow graph for register data flow (RDF)
+// for a non-SSA program representation (e.g. post-RA machine code).
+//
+//
+// *** Introduction
+//
+// The RDF graph is a collection of nodes, each of which denotes some element
+// of the program. There are two main types of such elements: code and
+// references. Conceptually, "code" is something that represents the structure
+// of the program, e.g. basic block or a statement, while "reference" is an
+// instance of accessing a register, e.g. a definition or a use. Nodes are
+// connected with each other based on the structure of the program (such as
+// blocks, instructions, etc.), and based on the data flow (e.g. reaching
+// definitions, reached uses, etc.). The single-reaching-definition principle
+// of SSA is generally observed, although, due to the non-SSA representation
+// of the program, there are some differences between the graph and a "pure"
+// SSA representation.
+//
+//
+// *** Implementation remarks
+//
+// Since the graph can contain a large number of nodes, memory consumption
+// was one of the major design considerations. As a result, there is a single
+// base class NodeBase which defines all members used by all possible derived
+// classes. The members are arranged in a union, and a derived class cannot
+// add any data members of its own. Each derived class only defines the
+// functional interface, i.e. member functions. NodeBase must be a POD,
+// which implies that all of its members must also be PODs.
+// Since nodes need to be connected with other nodes, pointers have been
+// replaced with 32-bit identifiers: each node has an id of type NodeId.
+// There are mapping functions in the graph that translate between actual
+// memory addresses and the corresponding identifiers.
+// A node id of 0 is equivalent to nullptr.
+//
+//
+// *** Structure of the graph
+//
+// A code node is always a collection of other nodes. For example, a code
+// node corresponding to a basic block will contain code nodes corresponding
+// to instructions. In turn, a code node corresponding to an instruction will
+// contain a list of reference nodes that correspond to the definitions and
+// uses of registers in that instruction. The members are arranged into a
+// circular list, which is yet another consequence of the effort to save
+// memory: for each member node it should be possible to obtain its owner,
+// and it should be possible to access all other members. There are other
+// ways to accomplish that, but the circular list seemed the most natural.
+//
+// +- CodeNode -+
+// | | <---------------------------------------------------+
+// +-+--------+-+ |
+// |FirstM |LastM |
+// | +-------------------------------------+ |
+// | | |
+// V V |
+// +----------+ Next +----------+ Next Next +----------+ Next |
+// | |----->| |-----> ... ----->| |----->-+
+// +- Member -+ +- Member -+ +- Member -+
+//
+// The order of members is such that related reference nodes (see below)
+// should be contiguous on the member list.
+//
+// A reference node is a node that encapsulates an access to a register,
+// in other words, data flowing into or out of a register. There are two
+// major kinds of reference nodes: defs and uses. A def node will contain
+// the id of the first reached use, and the id of the first reached def.
+// Each def and use will contain the id of the reaching def, and also the
+// id of the next reached def (for def nodes) or use (for use nodes).
+// The "next node sharing the same reaching def" is denoted as "sibling".
+// In summary:
+// - Def node contains: reaching def, sibling, first reached def, and first
+// reached use.
+// - Use node contains: reaching def and sibling.
+//
+// +-- DefNode --+
+// | R2 = ... | <---+--------------------+
+// ++---------+--+ | |
+// |Reached |Reached | |
+// |Def |Use | |
+// | | |Reaching |Reaching
+// | V |Def |Def
+// | +-- UseNode --+ Sib +-- UseNode --+ Sib Sib
+// | | ... = R2 |----->| ... = R2 |----> ... ----> 0
+// | +-------------+ +-------------+
+// V
+// +-- DefNode --+ Sib
+// | R2 = ... |----> ...
+// ++---------+--+
+// | |
+// | |
+// ... ...
+//
+// To get a full picture, the circular lists connecting blocks within a
+// function, instructions within a block, etc. should be superimposed with
+// the def-def, def-use links shown above.
+// To illustrate this, consider a small example in a pseudo-assembly:
+// foo:
+// add r2, r0, r1 ; r2 = r0+r1
+// addi r0, r2, 1 ; r0 = r2+1
+// ret r0 ; return value in r0
+//
+// The graph (in a format used by the debugging functions) would look like:
+//
+// DFG dump:[
+// f1: Function foo
+// b2: === %bb.0 === preds(0), succs(0):
+// p3: phi [d4<r0>(,d12,u9):]
+// p5: phi [d6<r1>(,,u10):]
+// s7: add [d8<r2>(,,u13):, u9<r0>(d4):, u10<r1>(d6):]
+// s11: addi [d12<r0>(d4,,u15):, u13<r2>(d8):]
+// s14: ret [u15<r0>(d12):]
+// ]
+//
+// The f1, b2, p3, etc. are node ids. The letter is prepended to indicate the
+// kind of the node (i.e. f - function, b - basic block, p - phi,
+// s - statement, d - def, u - use).
+// The format of a def node is:
+// dN<R>(rd,d,u):sib,
+// where
+// N - numeric node id,
+// R - register being defined
+// rd - reaching def,
+// d - reached def,
+// u - reached use,
+// sib - sibling.
+// The format of a use node is:
+// uN<R>[!](rd):sib,
+// where
+// N - numeric node id,
+// R - register being used,
+// rd - reaching def,
+// sib - sibling.
+// Possible annotations (usually preceding the node id):
+// + - preserving def,
+// ~ - clobbering def,
+// " - shadow ref (follows the node id),
+// ! - fixed register (appears after register name).
+//
+// The circular lists are not explicit in the dump.
+//
+//
+// *** Node attributes
+//
+// NodeBase has a member "Attrs", which is the primary way of determining
+// the node's characteristics. The fields in this member decide whether
+// the node is a code node or a reference node (i.e. node's "type"), then
+// within each type, the "kind" determines what specifically this node
+// represents. The remaining bits, "flags", contain additional information
+// that is even more detailed than the "kind".
+// CodeNode's kinds are:
+// - Phi: Phi node, members are reference nodes.
+// - Stmt: Statement, members are reference nodes.
+// - Block: Basic block, members are instruction nodes (i.e. Phi or Stmt).
+// - Func: The whole function. The members are basic block nodes.
+// RefNode's kinds are:
+// - Use.
+// - Def.
+//
+// Meaning of flags:
+// - Preserving: applies only to defs. A preserving def is one that can
+// preserve some of the original bits among those that are included in
+// the register associated with that def. For example, if R0 is a 32-bit
+// register, but a def can only change the lower 16 bits, then it will
+// be marked as preserving.
+// - Shadow: a reference that has duplicates holding additional reaching
+// defs (see more below).
+// - Clobbering: applied only to defs, indicates that the value generated
+// by this def is unspecified. A typical example would be volatile registers
+// after function calls.
+// - Fixed: the register in this def/use cannot be replaced with any other
+// register. A typical case would be a parameter register to a call, or
+// the register with the return value from a function.
+// - Undef: the register in this reference is assumed to have
+// no pre-existing value, even if it appears to be reached by some def.
+// This is typically used to prevent keeping registers artificially live
+// in cases when they are defined via predicated instructions. For example:
+// r0 = add-if-true cond, r10, r11 (1)
+// r0 = add-if-false cond, r12, r13, implicit r0 (2)
+// ... = r0 (3)
+// Before (1), r0 is not intended to be live, and the use of r0 in (3) is
+// not meant to be reached by any def preceding (1). However, since the
+// defs in (1) and (2) are both preserving, these properties alone would
+// imply that the use in (3) may indeed be reached by some prior def.
+// Adding Undef flag to the def in (1) prevents that. The Undef flag
+// may be applied to both defs and uses.
+// - Dead: applies only to defs. The value coming out of a "dead" def is
+// assumed to be unused, even if the def appears to be reaching other defs
+// or uses. The motivation for this flag comes from dead defs on function
+// calls: there is no way to determine if such a def is dead without
+// analyzing the target's ABI. Hence the graph should contain this info,
+// as it is unavailable otherwise. On the other hand, a def without any
+// uses on a typical instruction is not the intended target for this flag.
+//
+// *** Shadow references
+//
+// It may happen that a super-register can have two (or more) non-overlapping
+// sub-registers. When both of these sub-registers are defined and followed
+// by a use of the super-register, the use of the super-register will not
+// have a unique reaching def: both defs of the sub-registers need to be
+// accounted for. In such cases, a duplicate use of the super-register is
+// added and it points to the extra reaching def. Both uses are marked with
+// a flag "shadow". Example:
+// Assume t0 is a super-register of r0 and r1, r0 and r1 do not overlap:
+// set r0, 1 ; r0 = 1
+// set r1, 1 ; r1 = 1
+// addi t1, t0, 1 ; t1 = t0+1
+//
+// The DFG:
+// s1: set [d2<r0>(,,u9):]
+// s3: set [d4<r1>(,,u10):]
+// s5: addi [d6<t1>(,,):, u7"<t0>(d2):, u8"<t0>(d4):]
+//
+// The statement s5 has two use nodes for t0: u7" and u8". The quotation
+// mark " indicates that the node is a shadow.
+//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
+#define LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
+
+#include "RDFRegisters.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+// RDF uses uint32_t to refer to registers. This is to ensure that the type
+// size remains specific. In other places, registers are often stored using
+// unsigned.
+static_assert(sizeof(uint32_t) == sizeof(unsigned), "Those should be equal");
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineDominanceFrontier;
+class MachineDominatorTree;
+class MachineFunction;
+class MachineInstr;
+class MachineOperand;
+class raw_ostream;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+
+namespace rdf {
+
+ using NodeId = uint32_t;
+
+ struct DataFlowGraph;
+
+ struct NodeAttrs {
+ enum : uint16_t {
+ None = 0x0000, // Nothing
+
+ // Types: 2 bits
+ TypeMask = 0x0003,
+ Code = 0x0001, // 01, Container
+ Ref = 0x0002, // 10, Reference
+
+ // Kind: 3 bits
+ KindMask = 0x0007 << 2,
+ Def = 0x0001 << 2, // 001
+ Use = 0x0002 << 2, // 010
+ Phi = 0x0003 << 2, // 011
+ Stmt = 0x0004 << 2, // 100
+ Block = 0x0005 << 2, // 101
+ Func = 0x0006 << 2, // 110
+
+ // Flags: 7 bits for now
+ FlagMask = 0x007F << 5,
+ Shadow = 0x0001 << 5, // 0000001, Has extra reaching defs.
+ Clobbering = 0x0002 << 5, // 0000010, Produces unspecified values.
+ PhiRef = 0x0004 << 5, // 0000100, Member of PhiNode.
+ Preserving = 0x0008 << 5, // 0001000, Def can keep original bits.
+ Fixed = 0x0010 << 5, // 0010000, Fixed register.
+ Undef = 0x0020 << 5, // 0100000, Has no pre-existing value.
+ Dead = 0x0040 << 5, // 1000000, Does not define a value.
+ };
+
+ static uint16_t type(uint16_t T) { return T & TypeMask; }
+ static uint16_t kind(uint16_t T) { return T & KindMask; }
+ static uint16_t flags(uint16_t T) { return T & FlagMask; }
+
+ static uint16_t set_type(uint16_t A, uint16_t T) {
+ return (A & ~TypeMask) | T;
+ }
+
+ static uint16_t set_kind(uint16_t A, uint16_t K) {
+ return (A & ~KindMask) | K;
+ }
+
+ static uint16_t set_flags(uint16_t A, uint16_t F) {
+ return (A & ~FlagMask) | F;
+ }
+
+ // Test if A contains B.
+ static bool contains(uint16_t A, uint16_t B) {
+ if (type(A) != Code)
+ return false;
+ uint16_t KB = kind(B);
+ switch (kind(A)) {
+ case Func:
+ return KB == Block;
+ case Block:
+ return KB == Phi || KB == Stmt;
+ case Phi:
+ case Stmt:
+ return type(B) == Ref;
+ }
+ return false;
+ }
+ };
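+ // Illustrative note (not part of the original header): attributes compose
+ // by OR-ing one type, one kind and any flags, e.g.
+ //   uint16_t A = NodeAttrs::Ref | NodeAttrs::Def | NodeAttrs::Clobbering;
+ // after which NodeAttrs::kind(A) recovers NodeAttrs::Def and
+ // NodeAttrs::flags(A) recovers NodeAttrs::Clobbering.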
+
+ struct BuildOptions {
+ enum : unsigned {
+ None = 0x00,
+ KeepDeadPhis = 0x01, // Do not remove dead phis during build.
+ };
+ };
+
+ template <typename T> struct NodeAddr {
+ NodeAddr() = default;
+ NodeAddr(T A, NodeId I) : Addr(A), Id(I) {}
+
+ // Type cast (casting constructor). This converting constructor is the
+ // reason for having this class instead of std::pair.
+ template <typename S> NodeAddr(const NodeAddr<S> &NA)
+ : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {}
+
+ bool operator== (const NodeAddr<T> &NA) const {
+ assert((Addr == NA.Addr) == (Id == NA.Id));
+ return Addr == NA.Addr;
+ }
+ bool operator!= (const NodeAddr<T> &NA) const {
+ return !operator==(NA);
+ }
+
+ T Addr = nullptr;
+ NodeId Id = 0;
+ };
+
+ struct NodeBase;
+
+ // Fast memory allocation and translation between node id and node address.
+ // This is really the same idea as the one underlying the "bump pointer
+ // allocator", the difference being in the translation. A node id is
+ // composed of two components: the index of the block in which it was
+ // allocated, and the index within the block. With the default settings,
+ // where the number of nodes per block is 4096, the node id (minus 1) is:
+ //
+ // bit position: 11 0
+ // +----------------------------+--------------+
+ // | Index of the block |Index in block|
+ // +----------------------------+--------------+
+ //
+ // The actual node id is the above plus 1, to avoid creating a node id of 0.
+ //
+ // This method significantly improved the build time, compared to using maps
+ // (std::unordered_map or DenseMap) to translate between pointers and ids.
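+ // Illustrative note (not part of the original header): with the default
+ // 4096 nodes per block (BitsPerIndex = 12), node id 4098 encodes N-1 = 4097
+ // = (block 1, index 1), so ptr(4098) returns Blocks[1] + 1*NodeMemSize.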
+ struct NodeAllocator {
+ // Amount of storage for a single node.
+ enum { NodeMemSize = 32 };
+
+ NodeAllocator(uint32_t NPB = 4096)
+ : NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)),
+ IndexMask((1 << BitsPerIndex)-1) {
+ assert(isPowerOf2_32(NPB));
+ }
+
+ NodeBase *ptr(NodeId N) const {
+ uint32_t N1 = N-1;
+ uint32_t BlockN = N1 >> BitsPerIndex;
+ uint32_t Offset = (N1 & IndexMask) * NodeMemSize;
+ return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset);
+ }
+
+ NodeId id(const NodeBase *P) const;
+ NodeAddr<NodeBase*> New();
+ void clear();
+
+ private:
+ void startNewBlock();
+ bool needNewBlock();
+
+ uint32_t makeId(uint32_t Block, uint32_t Index) const {
+ // Add 1 to the id, to avoid the id of 0, which is treated as "null".
+ return ((Block << BitsPerIndex) | Index) + 1;
+ }
+
+ const uint32_t NodesPerBlock;
+ const uint32_t BitsPerIndex;
+ const uint32_t IndexMask;
+ char *ActiveEnd = nullptr;
+ std::vector<char*> Blocks;
+ using AllocatorTy = BumpPtrAllocatorImpl<MallocAllocator, 65536>;
+ AllocatorTy MemPool;
+ };
+
+ using RegisterSet = std::set<RegisterRef>;
+
+ struct TargetOperandInfo {
+ TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {}
+ virtual ~TargetOperandInfo() = default;
+
+ virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const;
+ virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const;
+ virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const;
+
+ const TargetInstrInfo &TII;
+ };
+
+ // Packed register reference. Only used for storage.
+ struct PackedRegisterRef {
+ RegisterId Reg;
+ uint32_t MaskId;
+ };
+
+ struct LaneMaskIndex : private IndexedSet<LaneBitmask> {
+ LaneMaskIndex() = default;
+
+ LaneBitmask getLaneMaskForIndex(uint32_t K) const {
+ return K == 0 ? LaneBitmask::getAll() : get(K);
+ }
+
+ uint32_t getIndexForLaneMask(LaneBitmask LM) {
+ assert(LM.any());
+ return LM.all() ? 0 : insert(LM);
+ }
+
+ uint32_t getIndexForLaneMask(LaneBitmask LM) const {
+ assert(LM.any());
+ return LM.all() ? 0 : find(LM);
+ }
+ };
+
+ struct NodeBase {
+ public:
+ // Make sure this is a POD.
+ NodeBase() = default;
+
+ uint16_t getType() const { return NodeAttrs::type(Attrs); }
+ uint16_t getKind() const { return NodeAttrs::kind(Attrs); }
+ uint16_t getFlags() const { return NodeAttrs::flags(Attrs); }
+ NodeId getNext() const { return Next; }
+
+ uint16_t getAttrs() const { return Attrs; }
+ void setAttrs(uint16_t A) { Attrs = A; }
+ void setFlags(uint16_t F) { setAttrs(NodeAttrs::set_flags(getAttrs(), F)); }
+
+ // Insert node NA after "this" in the circular chain.
+ void append(NodeAddr<NodeBase*> NA);
+
+ // Initialize all members to 0.
+ void init() { memset(this, 0, sizeof *this); }
+
+ void setNext(NodeId N) { Next = N; }
+
+ protected:
+ uint16_t Attrs;
+ uint16_t Reserved;
+ NodeId Next; // Id of the next node in the circular chain.
+ // Definitions of nested types. Using anonymous nested structs would make
+ // this class definition clearer, but unnamed structs are not a part of
+ // the standard.
+ struct Def_struct {
+ NodeId DD, DU; // Ids of the first reached def and use.
+ };
+ struct PhiU_struct {
+ NodeId PredB; // Id of the predecessor block for a phi use.
+ };
+ struct Code_struct {
+ void *CP; // Pointer to the actual code.
+ NodeId FirstM, LastM; // Id of the first member and last.
+ };
+ struct Ref_struct {
+ NodeId RD, Sib; // Ids of the reaching def and the sibling.
+ union {
+ Def_struct Def;
+ PhiU_struct PhiU;
+ };
+ union {
+ MachineOperand *Op; // Non-phi refs point to a machine operand.
+ PackedRegisterRef PR; // Phi refs store register info directly.
+ };
+ };
+
+ // The actual payload.
+ union {
+ Ref_struct Ref;
+ Code_struct Code;
+ };
+ };
+ // The allocator allocates chunks of 32 bytes for each node. The fact that
+ // each node takes 32 bytes in memory is used for fast translation between
+ // the node id and the node address.
+ static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize,
+ "NodeBase must be at most NodeAllocator::NodeMemSize bytes");
+
+ using NodeList = SmallVector<NodeAddr<NodeBase *>, 4>;
+ using NodeSet = std::set<NodeId>;
+
+ struct RefNode : public NodeBase {
+ RefNode() = default;
+
+ RegisterRef getRegRef(const DataFlowGraph &G) const;
+
+ MachineOperand &getOp() {
+ assert(!(getFlags() & NodeAttrs::PhiRef));
+ return *Ref.Op;
+ }
+
+ void setRegRef(RegisterRef RR, DataFlowGraph &G);
+ void setRegRef(MachineOperand *Op, DataFlowGraph &G);
+
+ NodeId getReachingDef() const {
+ return Ref.RD;
+ }
+ void setReachingDef(NodeId RD) {
+ Ref.RD = RD;
+ }
+
+ NodeId getSibling() const {
+ return Ref.Sib;
+ }
+ void setSibling(NodeId Sib) {
+ Ref.Sib = Sib;
+ }
+
+ bool isUse() const {
+ assert(getType() == NodeAttrs::Ref);
+ return getKind() == NodeAttrs::Use;
+ }
+
+ bool isDef() const {
+ assert(getType() == NodeAttrs::Ref);
+ return getKind() == NodeAttrs::Def;
+ }
+
+ template <typename Predicate>
+ NodeAddr<RefNode*> getNextRef(RegisterRef RR, Predicate P, bool NextOnly,
+ const DataFlowGraph &G);
+ NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G);
+ };
+
+ struct DefNode : public RefNode {
+ NodeId getReachedDef() const {
+ return Ref.Def.DD;
+ }
+ void setReachedDef(NodeId D) {
+ Ref.Def.DD = D;
+ }
+ NodeId getReachedUse() const {
+ return Ref.Def.DU;
+ }
+ void setReachedUse(NodeId U) {
+ Ref.Def.DU = U;
+ }
+
+ void linkToDef(NodeId Self, NodeAddr<DefNode*> DA);
+ };
+
+ struct UseNode : public RefNode {
+ void linkToDef(NodeId Self, NodeAddr<DefNode*> DA);
+ };
+
+ struct PhiUseNode : public UseNode {
+ NodeId getPredecessor() const {
+ assert(getFlags() & NodeAttrs::PhiRef);
+ return Ref.PhiU.PredB;
+ }
+ void setPredecessor(NodeId B) {
+ assert(getFlags() & NodeAttrs::PhiRef);
+ Ref.PhiU.PredB = B;
+ }
+ };
+
+ struct CodeNode : public NodeBase {
+ template <typename T> T getCode() const {
+ return static_cast<T>(Code.CP);
+ }
+ void setCode(void *C) {
+ Code.CP = C;
+ }
+
+ NodeAddr<NodeBase*> getFirstMember(const DataFlowGraph &G) const;
+ NodeAddr<NodeBase*> getLastMember(const DataFlowGraph &G) const;
+ void addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G);
+ void addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA,
+ const DataFlowGraph &G);
+ void removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G);
+
+ NodeList members(const DataFlowGraph &G) const;
+ template <typename Predicate>
+ NodeList members_if(Predicate P, const DataFlowGraph &G) const;
+ };
+
+ struct InstrNode : public CodeNode {
+ NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G);
+ };
+
+ struct PhiNode : public InstrNode {
+ MachineInstr *getCode() const {
+ return nullptr;
+ }
+ };
+
+ struct StmtNode : public InstrNode {
+ MachineInstr *getCode() const {
+ return CodeNode::getCode<MachineInstr*>();
+ }
+ };
+
+ struct BlockNode : public CodeNode {
+ MachineBasicBlock *getCode() const {
+ return CodeNode::getCode<MachineBasicBlock*>();
+ }
+
+ void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G);
+ };
+
+ struct FuncNode : public CodeNode {
+ MachineFunction *getCode() const {
+ return CodeNode::getCode<MachineFunction*>();
+ }
+
+ NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB,
+ const DataFlowGraph &G) const;
+ NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G);
+ };
+
+ struct DataFlowGraph {
+ DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
+ const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
+ const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi);
+
+ NodeBase *ptr(NodeId N) const;
+ template <typename T> T ptr(NodeId N) const {
+ return static_cast<T>(ptr(N));
+ }
+
+ NodeId id(const NodeBase *P) const;
+
+ template <typename T> NodeAddr<T> addr(NodeId N) const {
+ return { ptr<T>(N), N };
+ }
+
+ NodeAddr<FuncNode*> getFunc() const { return Func; }
+ MachineFunction &getMF() const { return MF; }
+ const TargetInstrInfo &getTII() const { return TII; }
+ const TargetRegisterInfo &getTRI() const { return TRI; }
+ const PhysicalRegisterInfo &getPRI() const { return PRI; }
+ const MachineDominatorTree &getDT() const { return MDT; }
+ const MachineDominanceFrontier &getDF() const { return MDF; }
+ const RegisterAggr &getLiveIns() const { return LiveIns; }
+
+ struct DefStack {
+ DefStack() = default;
+
+ bool empty() const { return Stack.empty() || top() == bottom(); }
+
+ private:
+ using value_type = NodeAddr<DefNode *>;
+ struct Iterator {
+ using value_type = DefStack::value_type;
+
+ Iterator &up() { Pos = DS.nextUp(Pos); return *this; }
+ Iterator &down() { Pos = DS.nextDown(Pos); return *this; }
+
+ value_type operator*() const {
+ assert(Pos >= 1);
+ return DS.Stack[Pos-1];
+ }
+ const value_type *operator->() const {
+ assert(Pos >= 1);
+ return &DS.Stack[Pos-1];
+ }
+ bool operator==(const Iterator &It) const { return Pos == It.Pos; }
+ bool operator!=(const Iterator &It) const { return Pos != It.Pos; }
+
+ private:
+ friend struct DefStack;
+
+ Iterator(const DefStack &S, bool Top);
+
+ // Pos-1 is the index in the StorageType object that corresponds to
+ // the top of the DefStack.
+ const DefStack &DS;
+ unsigned Pos;
+ };
+
+ public:
+ using iterator = Iterator;
+
+ iterator top() const { return Iterator(*this, true); }
+ iterator bottom() const { return Iterator(*this, false); }
+ unsigned size() const;
+
+ void push(NodeAddr<DefNode*> DA) { Stack.push_back(DA); }
+ void pop();
+ void start_block(NodeId N);
+ void clear_block(NodeId N);
+
+ private:
+ friend struct Iterator;
+
+ using StorageType = std::vector<value_type>;
+
+ bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const {
+ return (P.Addr == nullptr) && (N == 0 || P.Id == N);
+ }
+
+ unsigned nextUp(unsigned P) const;
+ unsigned nextDown(unsigned P) const;
+
+ StorageType Stack;
+ };
+
+ // This is a std::unordered_map for speed of accessing elements.
+ // Map: Register (physical or virtual) -> DefStack
+ using DefStackMap = std::unordered_map<RegisterId, DefStack>;
+
+ void build(unsigned Options = BuildOptions::None);
+ void pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+ void markBlock(NodeId B, DefStackMap &DefM);
+ void releaseBlock(NodeId B, DefStackMap &DefM);
+
+ PackedRegisterRef pack(RegisterRef RR) {
+ return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) };
+ }
+ PackedRegisterRef pack(RegisterRef RR) const {
+ return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) };
+ }
+ RegisterRef unpack(PackedRegisterRef PR) const {
+ return RegisterRef(PR.Reg, LMI.getLaneMaskForIndex(PR.MaskId));
+ }
+
+ RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const;
+ RegisterRef makeRegRef(const MachineOperand &Op) const;
+ RegisterRef restrictRef(RegisterRef AR, RegisterRef BR) const;
+
+ NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const;
+ NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA, bool Create);
+ NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const;
+ NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA, bool Create);
+ NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const;
+
+ NodeList getRelatedRefs(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const;
+
+ NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) const {
+ return BlockNodes.at(BB);
+ }
+
+ void unlinkUse(NodeAddr<UseNode*> UA, bool RemoveFromOwner) {
+ unlinkUseDF(UA);
+ if (RemoveFromOwner)
+ removeFromOwner(UA);
+ }
+
+ void unlinkDef(NodeAddr<DefNode*> DA, bool RemoveFromOwner) {
+ unlinkDefDF(DA);
+ if (RemoveFromOwner)
+ removeFromOwner(DA);
+ }
+
+ // Some useful filters.
+ template <uint16_t Kind>
+ static bool IsRef(const NodeAddr<NodeBase*> BA) {
+ return BA.Addr->getType() == NodeAttrs::Ref &&
+ BA.Addr->getKind() == Kind;
+ }
+
+ template <uint16_t Kind>
+ static bool IsCode(const NodeAddr<NodeBase*> BA) {
+ return BA.Addr->getType() == NodeAttrs::Code &&
+ BA.Addr->getKind() == Kind;
+ }
+
+ static bool IsDef(const NodeAddr<NodeBase*> BA) {
+ return BA.Addr->getType() == NodeAttrs::Ref &&
+ BA.Addr->getKind() == NodeAttrs::Def;
+ }
+
+ static bool IsUse(const NodeAddr<NodeBase*> BA) {
+ return BA.Addr->getType() == NodeAttrs::Ref &&
+ BA.Addr->getKind() == NodeAttrs::Use;
+ }
+
+ static bool IsPhi(const NodeAddr<NodeBase*> BA) {
+ return BA.Addr->getType() == NodeAttrs::Code &&
+ BA.Addr->getKind() == NodeAttrs::Phi;
+ }
+
+ static bool IsPreservingDef(const NodeAddr<DefNode*> DA) {
+ uint16_t Flags = DA.Addr->getFlags();
+ return (Flags & NodeAttrs::Preserving) && !(Flags & NodeAttrs::Undef);
+ }
+
+ private:
+ void reset();
+
+ RegisterSet getLandingPadLiveIns() const;
+
+ NodeAddr<NodeBase*> newNode(uint16_t Attrs);
+ NodeAddr<NodeBase*> cloneNode(const NodeAddr<NodeBase*> B);
+ NodeAddr<UseNode*> newUse(NodeAddr<InstrNode*> Owner,
+ MachineOperand &Op, uint16_t Flags = NodeAttrs::None);
+ NodeAddr<PhiUseNode*> newPhiUse(NodeAddr<PhiNode*> Owner,
+ RegisterRef RR, NodeAddr<BlockNode*> PredB,
+ uint16_t Flags = NodeAttrs::PhiRef);
+ NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner,
+ MachineOperand &Op, uint16_t Flags = NodeAttrs::None);
+ NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner,
+ RegisterRef RR, uint16_t Flags = NodeAttrs::PhiRef);
+ NodeAddr<PhiNode*> newPhi(NodeAddr<BlockNode*> Owner);
+ NodeAddr<StmtNode*> newStmt(NodeAddr<BlockNode*> Owner,
+ MachineInstr *MI);
+ NodeAddr<BlockNode*> newBlock(NodeAddr<FuncNode*> Owner,
+ MachineBasicBlock *BB);
+ NodeAddr<FuncNode*> newFunc(MachineFunction *MF);
+
+ template <typename Predicate>
+ std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
+ locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
+ Predicate P) const;
+
+ using BlockRefsMap = std::map<NodeId, RegisterSet>;
+
+ void buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In);
+ void recordDefsForDF(BlockRefsMap &PhiM, NodeAddr<BlockNode*> BA);
+ void buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs,
+ NodeAddr<BlockNode*> BA);
+ void removeUnusedPhis();
+
+ void pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+ void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+ template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA,
+ NodeAddr<T> TA, DefStack &DS);
+ template <typename Predicate> void linkStmtRefs(DefStackMap &DefM,
+ NodeAddr<StmtNode*> SA, Predicate P);
+ void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA);
+
+ void unlinkUseDF(NodeAddr<UseNode*> UA);
+ void unlinkDefDF(NodeAddr<DefNode*> DA);
+
+ void removeFromOwner(NodeAddr<RefNode*> RA) {
+ NodeAddr<InstrNode*> IA = RA.Addr->getOwner(*this);
+ IA.Addr->removeMember(RA, *this);
+ }
+
+ MachineFunction &MF;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const PhysicalRegisterInfo PRI;
+ const MachineDominatorTree &MDT;
+ const MachineDominanceFrontier &MDF;
+ const TargetOperandInfo &TOI;
+
+ RegisterAggr LiveIns;
+ NodeAddr<FuncNode*> Func;
+ NodeAllocator Memory;
+ // Local map: MachineBasicBlock -> NodeAddr<BlockNode*>
+ std::map<MachineBasicBlock*,NodeAddr<BlockNode*>> BlockNodes;
+ // Lane mask map.
+ LaneMaskIndex LMI;
+ }; // struct DataFlowGraph
+
+ template <typename Predicate>
+ NodeAddr<RefNode*> RefNode::getNextRef(RegisterRef RR, Predicate P,
+ bool NextOnly, const DataFlowGraph &G) {
+ // Get the "Next" reference in the circular list that references RR and
+ // satisfies the predicate "P".
+ auto NA = G.addr<NodeBase*>(getNext());
+
+ while (NA.Addr != this) {
+ if (NA.Addr->getType() == NodeAttrs::Ref) {
+ NodeAddr<RefNode*> RA = NA;
+ if (RA.Addr->getRegRef(G) == RR && P(NA))
+ return NA;
+ if (NextOnly)
+ break;
+ NA = G.addr<NodeBase*>(NA.Addr->getNext());
+ } else {
+ // We've hit the beginning of the chain.
+ assert(NA.Addr->getType() == NodeAttrs::Code);
+ NodeAddr<CodeNode*> CA = NA;
+ NA = CA.Addr->getFirstMember(G);
+ }
+ }
+ // Return the equivalent of "nullptr" if such a node was not found.
+ return NodeAddr<RefNode*>();
+ }
+
+ template <typename Predicate>
+ NodeList CodeNode::members_if(Predicate P, const DataFlowGraph &G) const {
+ NodeList MM;
+ auto M = getFirstMember(G);
+ if (M.Id == 0)
+ return MM;
+
+ while (M.Addr != this) {
+ if (P(M))
+ MM.push_back(M);
+ M = G.addr<NodeBase*>(M.Addr->getNext());
+ }
+ return MM;
+ }
+
+ template <typename T>
+ struct Print {
+ Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {}
+
+ const T &Obj;
+ const DataFlowGraph &G;
+ };
+
+ template <typename T>
+ struct PrintNode : Print<NodeAddr<T>> {
+ PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g)
+ : Print<NodeAddr<T>>(x, g) {}
+ };
+
+ raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterRef> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeId> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<DefNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<UseNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<NodeAddr<PhiUseNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<RefNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeList> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeSet> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<PhiNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<NodeAddr<StmtNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<NodeAddr<InstrNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<NodeAddr<BlockNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<NodeAddr<FuncNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterSet> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterAggr> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<DataFlowGraph::DefStack> &P);
+
+} // end namespace rdf
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
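A note on the DefStack defined above: real defs are interleaved with delimiter entries (Addr == nullptr, Id == block id), so that start_block/clear_block, and the markBlock/releaseBlock wrappers around them, can unwind everything a basic block pushed while the builder walks the dominator tree; the iterator's nextUp/nextDown simply skip over those delimiters. Below is a minimal, self-contained sketch of the delimiter scheme; Entry and DelimitedStack are illustrative names, not part of the RDF API.

// Sketch of the DefStack delimiter idea: a per-register stack of defs where
// each basic block pushes a sentinel (Addr == nullptr, Id == block id) on
// entry, and unwinding to that sentinel discards the block's defs.
#include <cassert>
#include <cstdint>
#include <vector>

struct Entry {
  const void *Addr;   // nullptr marks a block delimiter
  uint32_t Id;        // block id for delimiters, node id otherwise
};

struct DelimitedStack {
  void startBlock(uint32_t B) { S.push_back({nullptr, B}); }
  void push(const void *D, uint32_t Id) { S.push_back({D, Id}); }
  // Pop everything pushed since startBlock(B), including the delimiter.
  void clearBlock(uint32_t B) {
    while (!S.empty()) {
      Entry E = S.back();
      S.pop_back();
      if (E.Addr == nullptr && E.Id == B)
        break;            // found the matching delimiter
    }
  }
  size_t size() const { return S.size(); }

private:
  std::vector<Entry> S;
};

int main() {
  DelimitedStack DS;
  int D1, D2;
  DS.startBlock(1);
  DS.push(&D1, 10);
  DS.startBlock(2);        // dominator-tree child of block 1
  DS.push(&D2, 20);
  DS.clearBlock(2);        // unwind only block 2's defs
  assert(DS.size() == 2);  // block 1's delimiter and D1 remain
  DS.clearBlock(1);
  assert(DS.size() == 0);
}

In the real DataFlowGraph there is one such stack per register (DefStackMap), and the delimiters are what let releaseBlock drop a block's defs without disturbing entries pushed by dominating blocks.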
Property changes on: head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFGraph.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
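RefNode::getNextRef and CodeNode::members_if in the header above walk the same circular structure: the refs of an instruction are chained through getNext(), and the owning code node closes the chain back to its first member, so a traversal ends when it returns to its starting node rather than at a null pointer. A minimal sketch of that pattern over a plain singly linked cycle; Node and firstMatch are illustrative names only.

// Sketch of the circular member-list traversal used by getNextRef and
// members_if: follow Next pointers and stop on returning to the start.
#include <cassert>

struct Node {
  int Value;
  Node *Next;
};

// Return the first node after Start (in cyclic order) satisfying P,
// or nullptr after one full lap without a match.
template <typename Pred>
Node *firstMatch(Node *Start, Pred P) {
  for (Node *N = Start->Next; N != Start; N = N->Next)
    if (P(N))
      return N;
  return nullptr;
}

int main() {
  Node A{1, nullptr}, B{2, nullptr}, C{3, nullptr};
  A.Next = &B; B.Next = &C; C.Next = &A;   // close the cycle
  assert(firstMatch(&A, [](Node *N) { return N->Value == 3; }) == &C);
  assert(firstMatch(&A, [](Node *N) { return N->Value == 9; }) == nullptr);
}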
Index: head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFLiveness.h
===================================================================
--- head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFLiveness.h (nonexistent)
+++ head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFLiveness.h (revision 362609)
@@ -0,0 +1,151 @@
+//===- RDFLiveness.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Recalculate the liveness information given a data flow graph.
+// This includes block live-ins and kill flags.
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H
+#define LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H
+
+#include "RDFGraph.h"
+#include "RDFRegisters.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/LaneBitmask.h"
+#include <map>
+#include <set>
+#include <utility>
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineDominanceFrontier;
+class MachineDominatorTree;
+class MachineRegisterInfo;
+class TargetRegisterInfo;
+
+namespace rdf {
+
+ struct Liveness {
+ public:
+ // This is really a std::map, except that operator[] constructs a
+ // missing element from a pre-built default value, since RegisterAggr
+ // has no default constructor.
+ struct LiveMapType {
+ LiveMapType(const PhysicalRegisterInfo &pri) : Empty(pri) {}
+
+ RegisterAggr &operator[] (MachineBasicBlock *B) {
+ return Map.emplace(B, Empty).first->second;
+ }
+
+ private:
+ RegisterAggr Empty;
+ std::map<MachineBasicBlock*,RegisterAggr> Map;
+ };
+
+ using NodeRef = std::pair<NodeId, LaneBitmask>;
+ using NodeRefSet = std::set<NodeRef>;
+ // RegisterId in RefMap must be normalized.
+ using RefMap = std::map<RegisterId, NodeRefSet>;
+
+ Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g)
+ : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()),
+ MDF(g.getDF()), LiveMap(g.getPRI()), Empty(), NoRegs(g.getPRI()) {}
+
+ NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+ bool TopShadows, bool FullChain, const RegisterAggr &DefRRs);
+
+ NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA) {
+ return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false,
+ false, NoRegs);
+ }
+
+ NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA) {
+ return getAllReachingDefs(RefRR, RefA, false, false, NoRegs);
+ }
+
+ NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA,
+ const RegisterAggr &DefRRs);
+
+ NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA) {
+ return getAllReachedUses(RefRR, DefA, NoRegs);
+ }
+
+ std::pair<NodeSet,bool> getAllReachingDefsRec(RegisterRef RefRR,
+ NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs);
+
+ NodeAddr<RefNode*> getNearestAliasedRef(RegisterRef RefRR,
+ NodeAddr<InstrNode*> IA);
+
+ LiveMapType &getLiveMap() { return LiveMap; }
+ const LiveMapType &getLiveMap() const { return LiveMap; }
+
+ const RefMap &getRealUses(NodeId P) const {
+ auto F = RealUseMap.find(P);
+ return F == RealUseMap.end() ? Empty : F->second;
+ }
+
+ void computePhiInfo();
+ void computeLiveIns();
+ void resetLiveIns();
+ void resetKills();
+ void resetKills(MachineBasicBlock *B);
+
+ void trace(bool T) { Trace = T; }
+
+ private:
+ const DataFlowGraph &DFG;
+ const TargetRegisterInfo &TRI;
+ const PhysicalRegisterInfo &PRI;
+ const MachineDominatorTree &MDT;
+ const MachineDominanceFrontier &MDF;
+ LiveMapType LiveMap;
+ const RefMap Empty;
+ const RegisterAggr NoRegs;
+ bool Trace = false;
+
+ // Cache mapping node ids (for RefNodes) to their containing basic
+ // blocks. Caching this instead of recomputing it for every node
+ // reduces the liveness calculation time considerably.
+ using NodeBlockMap = DenseMap<NodeId, MachineBasicBlock *>;
+ NodeBlockMap NBMap;
+
+ // Phi information:
+ //
+ // RealUseMap
+ // map: NodeId -> (map: RegisterId -> NodeRefSet)
+ // phi id -> (map: register -> set of reached non-phi uses)
+ std::map<NodeId, RefMap> RealUseMap;
+
+ // Inverse iterated dominance frontier.
+ std::map<MachineBasicBlock*,std::set<MachineBasicBlock*>> IIDF;
+
+ // Live on entry.
+ std::map<MachineBasicBlock*,RefMap> PhiLON;
+
+ // Phi uses are considered to be located at the end of the block that
+ // they are associated with. The reaching def of a phi use dominates the
+ // block that the use corresponds to, but not the block that contains
+ // the phi itself. To include these uses in the liveness propagation (up
+ // the dominator tree), create a map: block -> set of uses live on exit.
+ std::map<MachineBasicBlock*,RefMap> PhiLOX;
+
+ MachineBasicBlock *getBlockWithRef(NodeId RN) const;
+ void traverse(MachineBasicBlock *B, RefMap &LiveIn);
+ void emptify(RefMap &M);
+
+ std::pair<NodeSet,bool> getAllReachingDefsRecImpl(RegisterRef RefRR,
+ NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs,
+ unsigned Nest, unsigned MaxNest);
+ };
+
+ raw_ostream &operator<<(raw_ostream &OS, const Print<Liveness::RefMap> &P);
+
+} // end namespace rdf
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H
Property changes on: head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFLiveness.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
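Two idioms in the Liveness header above are easy to miss. LiveMapType exists because RegisterAggr has no default constructor, so operator[] has to supply a pre-built Empty value on first access; and getRealUses returns a reference to a shared, immutable Empty map instead of inserting on a missed lookup, which keeps const lookups cheap. A self-contained sketch of both idioms; Aggr, DefaultMap and getOrEmpty are illustrative names.

// Idiom 1: a map whose operator[] constructs values that need a
// constructor argument (cf. LiveMapType wrapping RegisterAggr).
// Idiom 2: a const lookup returning a shared Empty value on a miss
// (cf. getRealUses over RealUseMap).
#include <cassert>
#include <map>
#include <set>

struct Aggr {                      // stand-in for RegisterAggr
  explicit Aggr(int Units) : Units(Units) {}
  int Units;
};

struct DefaultMap {
  explicit DefaultMap(int Units) : Empty(Units) {}
  Aggr &operator[](int Key) {
    // emplace inserts a copy of Empty only if Key is absent; either way
    // it yields the slot for Key.
    return Map.emplace(Key, Empty).first->second;
  }

private:
  Aggr Empty;
  std::map<int, Aggr> Map;
};

using RefSet = std::set<int>;
const RefSet &getOrEmpty(const std::map<int, RefSet> &M, int Key) {
  static const RefSet Empty;       // shared "not found" result
  auto F = M.find(Key);
  return F == M.end() ? Empty : F->second;
}

int main() {
  DefaultMap LiveMap(/*Units=*/32);
  assert(LiveMap[7].Units == 32);             // constructed from Empty

  std::map<int, RefSet> RealUses{{1, {10, 11}}};
  assert(getOrEmpty(RealUses, 2).empty());    // miss: no insertion
  assert(getOrEmpty(RealUses, 1).size() == 2);
}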
Index: head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFRegisters.h
===================================================================
--- head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFRegisters.h (nonexistent)
+++ head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFRegisters.h (revision 362609)
@@ -0,0 +1,240 @@
+//===- RDFRegisters.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
+#define LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <set>
+#include <vector>
+
+namespace llvm {
+
+class MachineFunction;
+class raw_ostream;
+
+namespace rdf {
+
+ using RegisterId = uint32_t;
+
+ // Template class for a map translating uint32_t into arbitrary types.
+ // The map will act like an indexed set: upon insertion of a new object,
+ // it will automatically assign a new index to it. Index of 0 is treated
+ // as invalid and is never allocated.
+ template <typename T, unsigned N = 32>
+ struct IndexedSet {
+ IndexedSet() { Map.reserve(N); }
+
+ T get(uint32_t Idx) const {
+ // Index Idx corresponds to Map[Idx-1].
+ assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
+ return Map[Idx-1];
+ }
+
+ uint32_t insert(T Val) {
+ // Linear search.
+ auto F = llvm::find(Map, Val);
+ if (F != Map.end())
+ return F - Map.begin() + 1;
+ Map.push_back(Val);
+ return Map.size(); // Return actual_index + 1.
+ }
+
+ uint32_t find(T Val) const {
+ auto F = llvm::find(Map, Val);
+ assert(F != Map.end());
+ return F - Map.begin() + 1;
+ }
+
+ uint32_t size() const { return Map.size(); }
+
+ using const_iterator = typename std::vector<T>::const_iterator;
+
+ const_iterator begin() const { return Map.begin(); }
+ const_iterator end() const { return Map.end(); }
+
+ private:
+ std::vector<T> Map;
+ };
+
+ struct RegisterRef {
+ RegisterId Reg = 0;
+ LaneBitmask Mask = LaneBitmask::getNone();
+
+ RegisterRef() = default;
+ explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
+ : Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {}
+
+ operator bool() const {
+ return Reg != 0 && Mask.any();
+ }
+
+ bool operator== (const RegisterRef &RR) const {
+ return Reg == RR.Reg && Mask == RR.Mask;
+ }
+
+ bool operator!= (const RegisterRef &RR) const {
+ return !operator==(RR);
+ }
+
+ bool operator< (const RegisterRef &RR) const {
+ return Reg < RR.Reg || (Reg == RR.Reg && Mask < RR.Mask);
+ }
+ };
+
+
+ struct PhysicalRegisterInfo {
+ PhysicalRegisterInfo(const TargetRegisterInfo &tri,
+ const MachineFunction &mf);
+
+ static bool isRegMaskId(RegisterId R) {
+ return Register::isStackSlot(R);
+ }
+
+ RegisterId getRegMaskId(const uint32_t *RM) const {
+ return Register::index2StackSlot(RegMasks.find(RM));
+ }
+
+ const uint32_t *getRegMaskBits(RegisterId R) const {
+ return RegMasks.get(Register::stackSlot2Index(R));
+ }
+
+ RegisterRef normalize(RegisterRef RR) const;
+
+ bool alias(RegisterRef RA, RegisterRef RB) const {
+ if (!isRegMaskId(RA.Reg))
+ return !isRegMaskId(RB.Reg) ? aliasRR(RA, RB) : aliasRM(RA, RB);
+ return !isRegMaskId(RB.Reg) ? aliasRM(RB, RA) : aliasMM(RA, RB);
+ }
+
+ std::set<RegisterId> getAliasSet(RegisterId Reg) const;
+
+ RegisterRef getRefForUnit(uint32_t U) const {
+ return RegisterRef(UnitInfos[U].Reg, UnitInfos[U].Mask);
+ }
+
+ const BitVector &getMaskUnits(RegisterId MaskId) const {
+ return MaskInfos[Register::stackSlot2Index(MaskId)].Units;
+ }
+
+ RegisterRef mapTo(RegisterRef RR, unsigned R) const;
+ const TargetRegisterInfo &getTRI() const { return TRI; }
+
+ private:
+ struct RegInfo {
+ const TargetRegisterClass *RegClass = nullptr;
+ };
+ struct UnitInfo {
+ RegisterId Reg = 0;
+ LaneBitmask Mask;
+ };
+ struct MaskInfo {
+ BitVector Units;
+ };
+
+ const TargetRegisterInfo &TRI;
+ IndexedSet<const uint32_t*> RegMasks;
+ std::vector<RegInfo> RegInfos;
+ std::vector<UnitInfo> UnitInfos;
+ std::vector<MaskInfo> MaskInfos;
+
+ bool aliasRR(RegisterRef RA, RegisterRef RB) const;
+ bool aliasRM(RegisterRef RR, RegisterRef RM) const;
+ bool aliasMM(RegisterRef RM, RegisterRef RN) const;
+ };
+
+ struct RegisterAggr {
+ RegisterAggr(const PhysicalRegisterInfo &pri)
+ : Units(pri.getTRI().getNumRegUnits()), PRI(pri) {}
+ RegisterAggr(const RegisterAggr &RG) = default;
+
+ bool empty() const { return Units.none(); }
+ bool hasAliasOf(RegisterRef RR) const;
+ bool hasCoverOf(RegisterRef RR) const;
+
+ static bool isCoverOf(RegisterRef RA, RegisterRef RB,
+ const PhysicalRegisterInfo &PRI) {
+ return RegisterAggr(PRI).insert(RA).hasCoverOf(RB);
+ }
+
+ RegisterAggr &insert(RegisterRef RR);
+ RegisterAggr &insert(const RegisterAggr &RG);
+ RegisterAggr &intersect(RegisterRef RR);
+ RegisterAggr &intersect(const RegisterAggr &RG);
+ RegisterAggr &clear(RegisterRef RR);
+ RegisterAggr &clear(const RegisterAggr &RG);
+
+ RegisterRef intersectWith(RegisterRef RR) const;
+ RegisterRef clearIn(RegisterRef RR) const;
+ RegisterRef makeRegRef() const;
+
+ void print(raw_ostream &OS) const;
+
+ struct rr_iterator {
+ using MapType = std::map<RegisterId, LaneBitmask>;
+
+ private:
+ MapType Masks;
+ MapType::iterator Pos;
+ unsigned Index;
+ const RegisterAggr *Owner;
+
+ public:
+ rr_iterator(const RegisterAggr &RG, bool End);
+
+ RegisterRef operator*() const {
+ return RegisterRef(Pos->first, Pos->second);
+ }
+
+ rr_iterator &operator++() {
+ ++Pos;
+ ++Index;
+ return *this;
+ }
+
+ bool operator==(const rr_iterator &I) const {
+ assert(Owner == I.Owner);
+ (void)Owner;
+ return Index == I.Index;
+ }
+
+ bool operator!=(const rr_iterator &I) const {
+ return !(*this == I);
+ }
+ };
+
+ rr_iterator rr_begin() const {
+ return rr_iterator(*this, false);
+ }
+ rr_iterator rr_end() const {
+ return rr_iterator(*this, true);
+ }
+
+ private:
+ BitVector Units;
+ const PhysicalRegisterInfo &PRI;
+ };
+
+ // Optionally print the lane mask, if it is not ~0.
+ struct PrintLaneMaskOpt {
+ PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
+ LaneBitmask Mask;
+ };
+ raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P);
+
+} // end namespace rdf
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
Property changes on: head/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFRegisters.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
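The IndexedSet in the header above hands out 1-based indices: index 0 is reserved as invalid, insert() returns position + 1, and get(Idx) reads Map[Idx-1]. PhysicalRegisterInfo then packs such indices into the stack-slot range of register ids (isRegMaskId/getRegMaskId) so that register masks and ordinary registers share one RegisterId space. A trimmed-down usage sketch of the indexing scheme; MiniIndexedSet is illustrative, not the real class.

// Usage sketch for the IndexedSet indexing scheme: 1-based indices,
// index 0 never allocated, duplicate inserts return the existing index.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

template <typename T>
struct MiniIndexedSet {
  T get(uint32_t Idx) const {
    assert(Idx != 0 && Idx - 1 < Map.size());  // 0 is invalid
    return Map[Idx - 1];
  }
  uint32_t insert(T Val) {
    auto F = std::find(Map.begin(), Map.end(), Val);
    if (F != Map.end())
      return F - Map.begin() + 1;  // already present: existing index
    Map.push_back(Val);
    return Map.size();             // actual position + 1
  }

private:
  std::vector<T> Map;
};

int main() {
  MiniIndexedSet<const char *> S;
  const char *A = "mask0", *B = "mask1";
  assert(S.insert(A) == 1);   // first element gets index 1, never 0
  assert(S.insert(B) == 2);
  assert(S.insert(A) == 1);   // duplicate insert: old index returned
  assert(S.get(2) == B);
}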
Index: head/contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsPowerPC.td
===================================================================
--- head/contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsPowerPC.td (revision 362608)
+++ head/contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsPowerPC.td (revision 362609)
@@ -1,1175 +1,1178 @@
//===- IntrinsicsPowerPC.td - Defines PowerPC intrinsics ---*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the PowerPC-specific intrinsics.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Definitions for all PowerPC intrinsics.
//
// Non-altivec intrinsics.
let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
// dcba/dcbf/dcbi/dcbst/dcbt/dcbtst/dcbz/dcbzl (PPC970) instructions.
def int_ppc_dcba : Intrinsic<[], [llvm_ptr_ty], []>;
def int_ppc_dcbf : GCCBuiltin<"__builtin_dcbf">,
Intrinsic<[], [llvm_ptr_ty], []>;
def int_ppc_dcbi : Intrinsic<[], [llvm_ptr_ty], []>;
def int_ppc_dcbst : Intrinsic<[], [llvm_ptr_ty], []>;
def int_ppc_dcbt : Intrinsic<[], [llvm_ptr_ty],
[IntrArgMemOnly, NoCapture<0>]>;
def int_ppc_dcbtst: Intrinsic<[], [llvm_ptr_ty],
[IntrArgMemOnly, NoCapture<0>]>;
def int_ppc_dcbz : Intrinsic<[], [llvm_ptr_ty], []>;
def int_ppc_dcbzl : Intrinsic<[], [llvm_ptr_ty], []>;
// sync instruction (i.e. sync 0, a.k.a. hwsync)
def int_ppc_sync : Intrinsic<[], [], []>;
// lwsync is sync 1
def int_ppc_lwsync : Intrinsic<[], [], []>;
// Intrinsics used to generate ctr-based loops. These should only be
// generated by the PowerPC backend!
// The branch intrinsic is marked as NoDuplicate because loop rotation will
// attempt to duplicate it, forming loops where a block reachable from one
// instance of it can contain another.
def int_ppc_mtctr : Intrinsic<[], [llvm_anyint_ty], []>;
def int_ppc_is_decremented_ctr_nonzero :
Intrinsic<[llvm_i1_ty], [], [IntrNoDuplicate]>;
// Intrinsics for [double]word extended forms of divide instructions
def int_ppc_divwe : GCCBuiltin<"__builtin_divwe">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_ppc_divweu : GCCBuiltin<"__builtin_divweu">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_ppc_divde : GCCBuiltin<"__builtin_divde">,
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
[IntrNoMem]>;
def int_ppc_divdeu : GCCBuiltin<"__builtin_divdeu">,
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
[IntrNoMem]>;
// Bit permute doubleword
def int_ppc_bpermd : GCCBuiltin<"__builtin_bpermd">,
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
[IntrNoMem]>;
def int_ppc_truncf128_round_to_odd
: GCCBuiltin<"__builtin_truncf128_round_to_odd">,
Intrinsic <[llvm_double_ty], [llvm_f128_ty], [IntrNoMem]>;
def int_ppc_sqrtf128_round_to_odd
: GCCBuiltin<"__builtin_sqrtf128_round_to_odd">,
Intrinsic <[llvm_f128_ty], [llvm_f128_ty], [IntrNoMem]>;
def int_ppc_addf128_round_to_odd
: GCCBuiltin<"__builtin_addf128_round_to_odd">,
Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>;
def int_ppc_subf128_round_to_odd
: GCCBuiltin<"__builtin_subf128_round_to_odd">,
Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>;
def int_ppc_mulf128_round_to_odd
: GCCBuiltin<"__builtin_mulf128_round_to_odd">,
Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>;
def int_ppc_divf128_round_to_odd
: GCCBuiltin<"__builtin_divf128_round_to_odd">,
Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>;
def int_ppc_fmaf128_round_to_odd
: GCCBuiltin<"__builtin_fmaf128_round_to_odd">,
Intrinsic <[llvm_f128_ty], [llvm_f128_ty,llvm_f128_ty,llvm_f128_ty], [IntrNoMem]>;
def int_ppc_scalar_extract_expq
: GCCBuiltin<"__builtin_vsx_scalar_extract_expq">,
Intrinsic <[llvm_i64_ty], [llvm_f128_ty], [IntrNoMem]>;
def int_ppc_scalar_insert_exp_qp
: GCCBuiltin<"__builtin_vsx_scalar_insert_exp_qp">,
Intrinsic <[llvm_f128_ty], [llvm_f128_ty, llvm_i64_ty], [IntrNoMem]>;
}
let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.".
/// PowerPC_Vec_Intrinsic - Base class for all altivec intrinsics.
class PowerPC_Vec_Intrinsic<string GCCIntSuffix, list<LLVMType> ret_types,
list<LLVMType> param_types,
list<IntrinsicProperty> properties>
: GCCBuiltin<!strconcat("__builtin_altivec_", GCCIntSuffix)>,
Intrinsic<ret_types, param_types, properties>;
/// PowerPC_VSX_Intrinsic - Base class for all VSX intrinsics.
class PowerPC_VSX_Intrinsic<string GCCIntSuffix, list<LLVMType> ret_types,
list<LLVMType> param_types,
list<IntrinsicProperty> properties>
: GCCBuiltin<!strconcat("__builtin_vsx_", GCCIntSuffix)>,
Intrinsic<ret_types, param_types, properties>;
}
//===----------------------------------------------------------------------===//
// PowerPC Altivec Intrinsic Class Definitions.
//
/// PowerPC_Vec_FF_Intrinsic - A PowerPC intrinsic that takes one v4f32
/// vector and returns one. These intrinsics have no side effects.
class PowerPC_Vec_FF_Intrinsic<string GCCIntSuffix>
: PowerPC_Vec_Intrinsic<GCCIntSuffix,
[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
/// PowerPC_Vec_FFF_Intrinsic - A PowerPC intrinsic that takes two v4f32
/// vectors and returns one. These intrinsics have no side effects.
class PowerPC_Vec_FFF_Intrinsic<string GCCIntSuffix>
: PowerPC_Vec_Intrinsic<GCCIntSuffix,
[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
/// PowerPC_Vec_BBB_Intrinsic - A PowerPC intrinsic that takes two v16i8
/// vectors and returns one. These intrinsics have no side effects.
class PowerPC_Vec_BBB_Intrinsic<string GCCIntSuffix>
: PowerPC_Vec_Intrinsic<GCCIntSuffix,
[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
/// PowerPC_Vec_HHH_Intrinsic - A PowerPC intrinsic that takes two v8i16
/// vectors and returns one. These intrinsics have no side effects.
class PowerPC_Vec_HHH_Intrinsic<string GCCIntSuffix>
: PowerPC_Vec_Intrinsic<GCCIntSuffix,
[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
/// PowerPC_Vec_WWW_Intrinsic - A PowerPC intrinsic that takes two v4i32
/// vectors and returns one. These intrinsics have no side effects.
class PowerPC_Vec_WWW_Intrinsic<string GCCIntSuffix>
: PowerPC_Vec_Intrinsic<GCCIntSuffix,
[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
/// PowerPC_Vec_DDD_Intrinsic - A PowerPC intrinsic that takes two v2i64
/// vectors and returns one. These intrinsics have no side effects.
class PowerPC_Vec_DDD_Intrinsic<string GCCIntSuffix>
: PowerPC_Vec_Intrinsic<GCCIntSuffix,
[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;
/// PowerPC_Vec_QQQ_Intrinsic - A PowerPC intrinsic that takes two v1i128
/// vectors and returns one. These intrinsics have no side effects.
class PowerPC_Vec_QQQ_Intrinsic<string GCCIntSuffix>
: PowerPC_Vec_Intrinsic<GCCIntSuffix,
[llvm_v1i128_ty], [llvm_v1i128_ty, llvm_v1i128_ty],
[IntrNoMem]>;
//===----------------------------------------------------------------------===//
// PowerPC VSX Intrinsic Class Definitions.
//
/// PowerPC_VSX_Vec_DDD_Intrinsic - A PowerPC intrinsic that takes two v2f64
/// vectors and returns one. These intrinsics have no side effects.
class PowerPC_VSX_Vec_DDD_Intrinsic<string GCCIntSuffix>
: PowerPC_VSX_Intrinsic<GCCIntSuffix,
[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
/// PowerPC_VSX_Vec_FFF_Intrinsic - A PowerPC intrinsic that takes two v4f32
/// vectors and returns one. These intrinsics have no side effects.
class PowerPC_VSX_Vec_FFF_Intrinsic<string GCCIntSuffix>
: PowerPC_VSX_Intrinsic<GCCIntSuffix,
[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
/// PowerPC_VSX_Sca_DDD_Intrinsic - A PowerPC intrinsic that takes two f64
/// scalars and returns one. These intrinsics have no side effects.
class PowerPC_VSX_Sca_DDD_Intrinsic<string GCCIntSuffix>
: PowerPC_VSX_Intrinsic<GCCIntSuffix,
[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
[IntrNoMem]>;
//===----------------------------------------------------------------------===//
// PowerPC Altivec Intrinsic Definitions.
let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
// Data Stream Control.
def int_ppc_altivec_dss : GCCBuiltin<"__builtin_altivec_dss">,
Intrinsic<[], [llvm_i32_ty], []>;
def int_ppc_altivec_dssall : GCCBuiltin<"__builtin_altivec_dssall">,
Intrinsic<[], [], []>;
def int_ppc_altivec_dst : GCCBuiltin<"__builtin_altivec_dst">,
Intrinsic<[],
[llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty],
[]>;
def int_ppc_altivec_dstt : GCCBuiltin<"__builtin_altivec_dstt">,
Intrinsic<[],
[llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty],
[]>;
def int_ppc_altivec_dstst : GCCBuiltin<"__builtin_altivec_dstst">,
Intrinsic<[],
[llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty],
[]>;
def int_ppc_altivec_dststt : GCCBuiltin<"__builtin_altivec_dststt">,
Intrinsic<[],
[llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty],
[]>;
// VSCR access.
def int_ppc_altivec_mfvscr : GCCBuiltin<"__builtin_altivec_mfvscr">,
Intrinsic<[llvm_v8i16_ty], [], [IntrReadMem]>;
def int_ppc_altivec_mtvscr : GCCBuiltin<"__builtin_altivec_mtvscr">,
Intrinsic<[], [llvm_v4i32_ty], []>;
// Loads. These don't map directly to GCC builtins because they represent the
// source address with a single pointer.
def int_ppc_altivec_lvx :
Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_ppc_altivec_lvxl :
Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_ppc_altivec_lvebx :
Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_ppc_altivec_lvehx :
Intrinsic<[llvm_v8i16_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_ppc_altivec_lvewx :
Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
// Stores. These don't map directly to GCC builtins because they represent the
// destination address with a single pointer.
def int_ppc_altivec_stvx :
Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly]>;
def int_ppc_altivec_stvxl :
Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly]>;
def int_ppc_altivec_stvebx :
Intrinsic<[], [llvm_v16i8_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly]>;
def int_ppc_altivec_stvehx :
Intrinsic<[], [llvm_v8i16_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly]>;
def int_ppc_altivec_stvewx :
Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly]>;
// Comparisons setting a vector.
def int_ppc_altivec_vcmpbfp : GCCBuiltin<"__builtin_altivec_vcmpbfp">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpeqfp : GCCBuiltin<"__builtin_altivec_vcmpeqfp">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgefp : GCCBuiltin<"__builtin_altivec_vcmpgefp">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtfp : GCCBuiltin<"__builtin_altivec_vcmpgtfp">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpequd : GCCBuiltin<"__builtin_altivec_vcmpequd">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtsd : GCCBuiltin<"__builtin_altivec_vcmpgtsd">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtud : GCCBuiltin<"__builtin_altivec_vcmpgtud">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpequw : GCCBuiltin<"__builtin_altivec_vcmpequw">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtsw : GCCBuiltin<"__builtin_altivec_vcmpgtsw">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtuw : GCCBuiltin<"__builtin_altivec_vcmpgtuw">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpnew : GCCBuiltin<"__builtin_altivec_vcmpnew">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpnezw : GCCBuiltin<"__builtin_altivec_vcmpnezw">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpequh : GCCBuiltin<"__builtin_altivec_vcmpequh">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtsh : GCCBuiltin<"__builtin_altivec_vcmpgtsh">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtuh : GCCBuiltin<"__builtin_altivec_vcmpgtuh">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpneh : GCCBuiltin<"__builtin_altivec_vcmpneh">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpnezh : GCCBuiltin<"__builtin_altivec_vcmpnezh">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpequb : GCCBuiltin<"__builtin_altivec_vcmpequb">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtsb : GCCBuiltin<"__builtin_altivec_vcmpgtsb">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtub : GCCBuiltin<"__builtin_altivec_vcmpgtub">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpneb : GCCBuiltin<"__builtin_altivec_vcmpneb">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpnezb : GCCBuiltin<"__builtin_altivec_vcmpnezb">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
// Predicate Comparisons. The first operand selects how the CR6 result is
// interpreted; see the usage sketch after this block of definitions.
def int_ppc_altivec_vcmpbfp_p : GCCBuiltin<"__builtin_altivec_vcmpbfp_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpeqfp_p : GCCBuiltin<"__builtin_altivec_vcmpeqfp_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgefp_p : GCCBuiltin<"__builtin_altivec_vcmpgefp_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtfp_p : GCCBuiltin<"__builtin_altivec_vcmpgtfp_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpequd_p : GCCBuiltin<"__builtin_altivec_vcmpequd_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2i64_ty,llvm_v2i64_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtsd_p : GCCBuiltin<"__builtin_altivec_vcmpgtsd_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2i64_ty,llvm_v2i64_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtud_p : GCCBuiltin<"__builtin_altivec_vcmpgtud_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2i64_ty,llvm_v2i64_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpequw_p : GCCBuiltin<"__builtin_altivec_vcmpequw_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4i32_ty,llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtsw_p : GCCBuiltin<"__builtin_altivec_vcmpgtsw_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4i32_ty,llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtuw_p : GCCBuiltin<"__builtin_altivec_vcmpgtuw_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4i32_ty,llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpnew_p : GCCBuiltin<"__builtin_altivec_vcmpnew_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4i32_ty,llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpnezw_p : GCCBuiltin<"__builtin_altivec_vcmpnezw_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4i32_ty,llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpequh_p : GCCBuiltin<"__builtin_altivec_vcmpequh_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v8i16_ty,llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtsh_p : GCCBuiltin<"__builtin_altivec_vcmpgtsh_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v8i16_ty,llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtuh_p : GCCBuiltin<"__builtin_altivec_vcmpgtuh_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v8i16_ty,llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpneh_p : GCCBuiltin<"__builtin_altivec_vcmpneh_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v8i16_ty,llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpnezh_p : GCCBuiltin<"__builtin_altivec_vcmpnezh_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v8i16_ty,llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpequb_p : GCCBuiltin<"__builtin_altivec_vcmpequb_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v16i8_ty,llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtsb_p : GCCBuiltin<"__builtin_altivec_vcmpgtsb_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v16i8_ty,llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpgtub_p : GCCBuiltin<"__builtin_altivec_vcmpgtub_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v16i8_ty,llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpneb_p : GCCBuiltin<"__builtin_altivec_vcmpneb_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v16i8_ty,llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vcmpnezb_p : GCCBuiltin<"__builtin_altivec_vcmpnezb_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v16i8_ty,llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vclzlsbb : GCCBuiltin<"__builtin_altivec_vclzlsbb">,
Intrinsic<[llvm_i32_ty],[llvm_v16i8_ty],[IntrNoMem]>;
def int_ppc_altivec_vctzlsbb : GCCBuiltin<"__builtin_altivec_vctzlsbb">,
Intrinsic<[llvm_i32_ty],[llvm_v16i8_ty],[IntrNoMem]>;
def int_ppc_altivec_vprtybw : GCCBuiltin<"__builtin_altivec_vprtybw">,
Intrinsic<[llvm_v4i32_ty],[llvm_v4i32_ty],[IntrNoMem]>;
def int_ppc_altivec_vprtybd : GCCBuiltin<"__builtin_altivec_vprtybd">,
Intrinsic<[llvm_v2i64_ty],[llvm_v2i64_ty],[IntrNoMem]>;
def int_ppc_altivec_vprtybq : GCCBuiltin<"__builtin_altivec_vprtybq">,
Intrinsic<[llvm_v1i128_ty],[llvm_v1i128_ty],[IntrNoMem]>;
}
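Each *_p definition above returns an i32 derived from CR6 after the vector compare, and the leading i32 operand selects which CR6 bit, and with which polarity, is read back. A hedged usage sketch, assuming a PowerPC target compiled with clang, whose <altivec.h> provides the conventional __CR6_EQ/__CR6_LT (and *_REV) values for that operand:

// Reading CR6 from the predicate forms of the AltiVec compares.
// Assumes clang with -maltivec on a PowerPC target.
#include <altivec.h>

// Nonzero iff every element of a equals the corresponding element of b:
// the "all true" (LT) interpretation of CR6.
int all_equal(vector int a, vector int b) {
  return __builtin_altivec_vcmpequw_p(__CR6_LT, a, b);
}

// Nonzero iff at least one element pair compares equal: the reverse of
// the "all false" (EQ) interpretation.
int any_equal(vector int a, vector int b) {
  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, a, b);
}

The same pattern underlies the vec_all_* and vec_any_* helpers in altivec.h.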
// Vector average.
def int_ppc_altivec_vavgsb : PowerPC_Vec_BBB_Intrinsic<"vavgsb">;
def int_ppc_altivec_vavgsh : PowerPC_Vec_HHH_Intrinsic<"vavgsh">;
def int_ppc_altivec_vavgsw : PowerPC_Vec_WWW_Intrinsic<"vavgsw">;
def int_ppc_altivec_vavgub : PowerPC_Vec_BBB_Intrinsic<"vavgub">;
def int_ppc_altivec_vavguh : PowerPC_Vec_HHH_Intrinsic<"vavguh">;
def int_ppc_altivec_vavguw : PowerPC_Vec_WWW_Intrinsic<"vavguw">;
// Vector maximum.
def int_ppc_altivec_vmaxfp : PowerPC_Vec_FFF_Intrinsic<"vmaxfp">;
def int_ppc_altivec_vmaxsb : PowerPC_Vec_BBB_Intrinsic<"vmaxsb">;
def int_ppc_altivec_vmaxsh : PowerPC_Vec_HHH_Intrinsic<"vmaxsh">;
def int_ppc_altivec_vmaxsw : PowerPC_Vec_WWW_Intrinsic<"vmaxsw">;
def int_ppc_altivec_vmaxsd : PowerPC_Vec_DDD_Intrinsic<"vmaxsd">;
def int_ppc_altivec_vmaxub : PowerPC_Vec_BBB_Intrinsic<"vmaxub">;
def int_ppc_altivec_vmaxuh : PowerPC_Vec_HHH_Intrinsic<"vmaxuh">;
def int_ppc_altivec_vmaxuw : PowerPC_Vec_WWW_Intrinsic<"vmaxuw">;
def int_ppc_altivec_vmaxud : PowerPC_Vec_DDD_Intrinsic<"vmaxud">;
// Vector minimum.
def int_ppc_altivec_vminfp : PowerPC_Vec_FFF_Intrinsic<"vminfp">;
def int_ppc_altivec_vminsb : PowerPC_Vec_BBB_Intrinsic<"vminsb">;
def int_ppc_altivec_vminsh : PowerPC_Vec_HHH_Intrinsic<"vminsh">;
def int_ppc_altivec_vminsw : PowerPC_Vec_WWW_Intrinsic<"vminsw">;
def int_ppc_altivec_vminsd : PowerPC_Vec_DDD_Intrinsic<"vminsd">;
def int_ppc_altivec_vminub : PowerPC_Vec_BBB_Intrinsic<"vminub">;
def int_ppc_altivec_vminuh : PowerPC_Vec_HHH_Intrinsic<"vminuh">;
def int_ppc_altivec_vminuw : PowerPC_Vec_WWW_Intrinsic<"vminuw">;
def int_ppc_altivec_vminud : PowerPC_Vec_DDD_Intrinsic<"vminud">;
// Saturating adds.
def int_ppc_altivec_vaddubs : PowerPC_Vec_BBB_Intrinsic<"vaddubs">;
def int_ppc_altivec_vaddsbs : PowerPC_Vec_BBB_Intrinsic<"vaddsbs">;
def int_ppc_altivec_vadduhs : PowerPC_Vec_HHH_Intrinsic<"vadduhs">;
def int_ppc_altivec_vaddshs : PowerPC_Vec_HHH_Intrinsic<"vaddshs">;
def int_ppc_altivec_vadduws : PowerPC_Vec_WWW_Intrinsic<"vadduws">;
def int_ppc_altivec_vaddsws : PowerPC_Vec_WWW_Intrinsic<"vaddsws">;
def int_ppc_altivec_vaddcuw : PowerPC_Vec_WWW_Intrinsic<"vaddcuw">;
def int_ppc_altivec_vaddcuq : PowerPC_Vec_QQQ_Intrinsic<"vaddcuq">;
// Saturating subs.
def int_ppc_altivec_vsububs : PowerPC_Vec_BBB_Intrinsic<"vsububs">;
def int_ppc_altivec_vsubsbs : PowerPC_Vec_BBB_Intrinsic<"vsubsbs">;
def int_ppc_altivec_vsubuhs : PowerPC_Vec_HHH_Intrinsic<"vsubuhs">;
def int_ppc_altivec_vsubshs : PowerPC_Vec_HHH_Intrinsic<"vsubshs">;
def int_ppc_altivec_vsubuws : PowerPC_Vec_WWW_Intrinsic<"vsubuws">;
def int_ppc_altivec_vsubsws : PowerPC_Vec_WWW_Intrinsic<"vsubsws">;
def int_ppc_altivec_vsubcuw : PowerPC_Vec_WWW_Intrinsic<"vsubcuw">;
def int_ppc_altivec_vsubcuq : PowerPC_Vec_QQQ_Intrinsic<"vsubcuq">;
let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.".
// Saturating multiply-adds.
def int_ppc_altivec_vmhaddshs : GCCBuiltin<"__builtin_altivec_vmhaddshs">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
def int_ppc_altivec_vmhraddshs : GCCBuiltin<"__builtin_altivec_vmhraddshs">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
def int_ppc_altivec_vmaddfp : GCCBuiltin<"__builtin_altivec_vmaddfp">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_altivec_vnmsubfp : GCCBuiltin<"__builtin_altivec_vnmsubfp">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
// Vector Multiply Sum Instructions (see the vmsumudm usage sketch after this block).
def int_ppc_altivec_vmsummbm : GCCBuiltin<"__builtin_altivec_vmsummbm">,
Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
llvm_v4i32_ty], [IntrNoMem]>;
def int_ppc_altivec_vmsumshm : GCCBuiltin<"__builtin_altivec_vmsumshm">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_v4i32_ty], [IntrNoMem]>;
def int_ppc_altivec_vmsumshs : GCCBuiltin<"__builtin_altivec_vmsumshs">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_v4i32_ty], [IntrNoMem]>;
def int_ppc_altivec_vmsumubm : GCCBuiltin<"__builtin_altivec_vmsumubm">,
Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
llvm_v4i32_ty], [IntrNoMem]>;
def int_ppc_altivec_vmsumuhm : GCCBuiltin<"__builtin_altivec_vmsumuhm">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_v4i32_ty], [IntrNoMem]>;
+ def int_ppc_altivec_vmsumudm : GCCBuiltin<"__builtin_altivec_vmsumudm">,
+ Intrinsic<[llvm_v1i128_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
+ llvm_v1i128_ty], [IntrNoMem]>;
def int_ppc_altivec_vmsumuhs : GCCBuiltin<"__builtin_altivec_vmsumuhs">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_v4i32_ty], [IntrNoMem]>;
// Vector Multiply Instructions.
def int_ppc_altivec_vmulesb : GCCBuiltin<"__builtin_altivec_vmulesb">,
Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmulesh : GCCBuiltin<"__builtin_altivec_vmulesh">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmulesw : GCCBuiltin<"__builtin_altivec_vmulesw">,
Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmuleub : GCCBuiltin<"__builtin_altivec_vmuleub">,
Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmuleuh : GCCBuiltin<"__builtin_altivec_vmuleuh">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmuleuw : GCCBuiltin<"__builtin_altivec_vmuleuw">,
Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmulosb : GCCBuiltin<"__builtin_altivec_vmulosb">,
Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmulosh : GCCBuiltin<"__builtin_altivec_vmulosh">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmulosw : GCCBuiltin<"__builtin_altivec_vmulosw">,
Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmuloub : GCCBuiltin<"__builtin_altivec_vmuloub">,
Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmulouh : GCCBuiltin<"__builtin_altivec_vmulouh">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vmulouw : GCCBuiltin<"__builtin_altivec_vmulouw">,
Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
// Vector Sum Instructions.
def int_ppc_altivec_vsumsws : GCCBuiltin<"__builtin_altivec_vsumsws">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vsum2sws : GCCBuiltin<"__builtin_altivec_vsum2sws">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vsum4sbs : GCCBuiltin<"__builtin_altivec_vsum4sbs">,
Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vsum4shs : GCCBuiltin<"__builtin_altivec_vsum4shs">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vsum4ubs : GCCBuiltin<"__builtin_altivec_vsum4ubs">,
Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty, llvm_v4i32_ty],
[IntrNoMem]>;
// Other multiplies.
def int_ppc_altivec_vmladduhm : GCCBuiltin<"__builtin_altivec_vmladduhm">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_v8i16_ty], [IntrNoMem]>;
// Packs.
def int_ppc_altivec_vpkpx : GCCBuiltin<"__builtin_altivec_vpkpx">,
Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vpkshss : GCCBuiltin<"__builtin_altivec_vpkshss">,
Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vpkshus : GCCBuiltin<"__builtin_altivec_vpkshus">,
Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vpkswss : GCCBuiltin<"__builtin_altivec_vpkswss">,
Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vpkswus : GCCBuiltin<"__builtin_altivec_vpkswus">,
Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vpksdss : GCCBuiltin<"__builtin_altivec_vpksdss">,
Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;
def int_ppc_altivec_vpksdus : GCCBuiltin<"__builtin_altivec_vpksdus">,
Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;
// vpkuhum is lowered to a shuffle.
def int_ppc_altivec_vpkuhus : GCCBuiltin<"__builtin_altivec_vpkuhus">,
Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
// vpkuwum is lowered to a shuffle.
def int_ppc_altivec_vpkuwus : GCCBuiltin<"__builtin_altivec_vpkuwus">,
Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
// vpkudum is lowered to a shuffle.
def int_ppc_altivec_vpkudus : GCCBuiltin<"__builtin_altivec_vpkudus">,
Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;
// Unpacks.
def int_ppc_altivec_vupkhpx : GCCBuiltin<"__builtin_altivec_vupkhpx">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
def int_ppc_altivec_vupkhsb : GCCBuiltin<"__builtin_altivec_vupkhsb">,
Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
def int_ppc_altivec_vupkhsh : GCCBuiltin<"__builtin_altivec_vupkhsh">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
def int_ppc_altivec_vupkhsw : GCCBuiltin<"__builtin_altivec_vupkhsw">,
Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
def int_ppc_altivec_vupklpx : GCCBuiltin<"__builtin_altivec_vupklpx">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
def int_ppc_altivec_vupklsb : GCCBuiltin<"__builtin_altivec_vupklsb">,
Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
def int_ppc_altivec_vupklsh : GCCBuiltin<"__builtin_altivec_vupklsh">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
def int_ppc_altivec_vupklsw : GCCBuiltin<"__builtin_altivec_vupklsw">,
Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
// FP <-> integer conversion.
def int_ppc_altivec_vcfsx : GCCBuiltin<"__builtin_altivec_vcfsx">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<1>]>;
def int_ppc_altivec_vcfux : GCCBuiltin<"__builtin_altivec_vcfux">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<1>]>;
def int_ppc_altivec_vctsxs : GCCBuiltin<"__builtin_altivec_vctsxs">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<1>]>;
def int_ppc_altivec_vctuxs : GCCBuiltin<"__builtin_altivec_vctuxs">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<1>]>;
def int_ppc_altivec_vrfim : GCCBuiltin<"__builtin_altivec_vrfim">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_altivec_vrfin : GCCBuiltin<"__builtin_altivec_vrfin">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_altivec_vrfip : GCCBuiltin<"__builtin_altivec_vrfip">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_altivec_vrfiz : GCCBuiltin<"__builtin_altivec_vrfiz">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
// Add Extended Quadword
def int_ppc_altivec_vaddeuqm : GCCBuiltin<"__builtin_altivec_vaddeuqm">,
Intrinsic<[llvm_v1i128_ty],
[llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty],
[IntrNoMem]>;
def int_ppc_altivec_vaddecuq : GCCBuiltin<"__builtin_altivec_vaddecuq">,
Intrinsic<[llvm_v1i128_ty],
[llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty],
[IntrNoMem]>;
// Sub Extended Quadword
def int_ppc_altivec_vsubeuqm : GCCBuiltin<"__builtin_altivec_vsubeuqm">,
Intrinsic<[llvm_v1i128_ty],
[llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty],
[IntrNoMem]>;
def int_ppc_altivec_vsubecuq : GCCBuiltin<"__builtin_altivec_vsubecuq">,
Intrinsic<[llvm_v1i128_ty],
[llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty],
[IntrNoMem]>;
}
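The only functional change in this hunk is the new int_ppc_altivec_vmsumudm definition above, which exposes the POWER9 (ISA 3.0) vmsumudm instruction. A hedged usage sketch follows; the C prototype is inferred from the TableGen types (v2i64, v2i64, v1i128 -> v1i128) and assumes clang with <altivec.h> on a suitable PowerPC target.

// vmsumudm: multiply-sum two unsigned doubleword vectors into a 128-bit
// accumulator. Prototype inferred from the intrinsic's TableGen types.
#include <altivec.h>

vector unsigned __int128 dot_accumulate(vector unsigned long long a,
                                        vector unsigned long long b,
                                        vector unsigned __int128 acc) {
  // Computes acc + a[0]*b[0] + a[1]*b[1] in 128-bit modular arithmetic.
  return __builtin_altivec_vmsumudm(a, b, acc);
}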
def int_ppc_altivec_vsl : PowerPC_Vec_WWW_Intrinsic<"vsl">;
def int_ppc_altivec_vslo : PowerPC_Vec_WWW_Intrinsic<"vslo">;
def int_ppc_altivec_vslb : PowerPC_Vec_BBB_Intrinsic<"vslb">;
def int_ppc_altivec_vslv : PowerPC_Vec_BBB_Intrinsic<"vslv">;
def int_ppc_altivec_vsrv : PowerPC_Vec_BBB_Intrinsic<"vsrv">;
def int_ppc_altivec_vslh : PowerPC_Vec_HHH_Intrinsic<"vslh">;
def int_ppc_altivec_vslw : PowerPC_Vec_WWW_Intrinsic<"vslw">;
// Right Shifts.
def int_ppc_altivec_vsr : PowerPC_Vec_WWW_Intrinsic<"vsr">;
def int_ppc_altivec_vsro : PowerPC_Vec_WWW_Intrinsic<"vsro">;
def int_ppc_altivec_vsrb : PowerPC_Vec_BBB_Intrinsic<"vsrb">;
def int_ppc_altivec_vsrh : PowerPC_Vec_HHH_Intrinsic<"vsrh">;
def int_ppc_altivec_vsrw : PowerPC_Vec_WWW_Intrinsic<"vsrw">;
def int_ppc_altivec_vsrab : PowerPC_Vec_BBB_Intrinsic<"vsrab">;
def int_ppc_altivec_vsrah : PowerPC_Vec_HHH_Intrinsic<"vsrah">;
def int_ppc_altivec_vsraw : PowerPC_Vec_WWW_Intrinsic<"vsraw">;
// Rotates.
def int_ppc_altivec_vrlb : PowerPC_Vec_BBB_Intrinsic<"vrlb">;
def int_ppc_altivec_vrlh : PowerPC_Vec_HHH_Intrinsic<"vrlh">;
def int_ppc_altivec_vrlw : PowerPC_Vec_WWW_Intrinsic<"vrlw">;
def int_ppc_altivec_vrld : PowerPC_Vec_DDD_Intrinsic<"vrld">;
let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.".
// Miscellaneous.
def int_ppc_altivec_lvsl :
Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty], [IntrNoMem]>;
def int_ppc_altivec_lvsr :
Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty], [IntrNoMem]>;
def int_ppc_altivec_vperm : GCCBuiltin<"__builtin_altivec_vperm_4si">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
llvm_v4i32_ty, llvm_v16i8_ty], [IntrNoMem]>;
def int_ppc_altivec_vsel : GCCBuiltin<"__builtin_altivec_vsel_4si">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
def int_ppc_altivec_vgbbd : GCCBuiltin<"__builtin_altivec_vgbbd">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
def int_ppc_altivec_vbpermq : GCCBuiltin<"__builtin_altivec_vbpermq">,
Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
}
def int_ppc_altivec_vexptefp : PowerPC_Vec_FF_Intrinsic<"vexptefp">;
def int_ppc_altivec_vlogefp : PowerPC_Vec_FF_Intrinsic<"vlogefp">;
def int_ppc_altivec_vrefp : PowerPC_Vec_FF_Intrinsic<"vrefp">;
def int_ppc_altivec_vrsqrtefp : PowerPC_Vec_FF_Intrinsic<"vrsqrtefp">;
// Power8 Intrinsics
// Crypto
let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.".
def int_ppc_altivec_crypto_vsbox :
GCCBuiltin<"__builtin_altivec_crypto_vsbox">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
def int_ppc_altivec_crypto_vpermxor :
GCCBuiltin<"__builtin_altivec_crypto_vpermxor">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
def int_ppc_altivec_crypto_vshasigmad :
GCCBuiltin<"__builtin_altivec_crypto_vshasigmad">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
def int_ppc_altivec_crypto_vshasigmaw :
GCCBuiltin<"__builtin_altivec_crypto_vshasigmaw">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
}
def int_ppc_altivec_crypto_vcipher :
PowerPC_Vec_DDD_Intrinsic<"crypto_vcipher">;
def int_ppc_altivec_crypto_vcipherlast :
PowerPC_Vec_DDD_Intrinsic<"crypto_vcipherlast">;
def int_ppc_altivec_crypto_vncipher :
PowerPC_Vec_DDD_Intrinsic<"crypto_vncipher">;
def int_ppc_altivec_crypto_vncipherlast :
PowerPC_Vec_DDD_Intrinsic<"crypto_vncipherlast">;
def int_ppc_altivec_crypto_vpmsumb :
PowerPC_Vec_BBB_Intrinsic<"crypto_vpmsumb">;
def int_ppc_altivec_crypto_vpmsumh :
PowerPC_Vec_HHH_Intrinsic<"crypto_vpmsumh">;
def int_ppc_altivec_crypto_vpmsumw :
PowerPC_Vec_WWW_Intrinsic<"crypto_vpmsumw">;
def int_ppc_altivec_crypto_vpmsumd :
PowerPC_Vec_DDD_Intrinsic<"crypto_vpmsumd">;
// Absolute Difference intrinsics
def int_ppc_altivec_vabsdub : PowerPC_Vec_BBB_Intrinsic<"vabsdub">;
def int_ppc_altivec_vabsduh : PowerPC_Vec_HHH_Intrinsic<"vabsduh">;
def int_ppc_altivec_vabsduw : PowerPC_Vec_WWW_Intrinsic<"vabsduw">;
// Vector rotates
def int_ppc_altivec_vrlwnm :
PowerPC_Vec_Intrinsic<"vrlwnm", [llvm_v4i32_ty],
[llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
def int_ppc_altivec_vrlwmi :
PowerPC_Vec_Intrinsic<"vrlwmi", [llvm_v4i32_ty],
[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vrldnm :
PowerPC_Vec_Intrinsic<"vrldnm", [llvm_v2i64_ty],
[llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
def int_ppc_altivec_vrldmi :
PowerPC_Vec_Intrinsic<"vrldmi", [llvm_v2i64_ty],
[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;
//===----------------------------------------------------------------------===//
// PowerPC VSX Intrinsic Definitions.
let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
// Vector load.
def int_ppc_vsx_lxvw4x :
Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_ppc_vsx_lxvd2x :
Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_ppc_vsx_lxvw4x_be :
Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_ppc_vsx_lxvd2x_be :
Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_ppc_vsx_lxvl :
Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrReadMem,
IntrArgMemOnly]>;
def int_ppc_vsx_lxvll :
Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrReadMem,
IntrArgMemOnly]>;
def int_ppc_vsx_stxvl :
Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i64_ty],
[IntrWriteMem, IntrArgMemOnly]>;
def int_ppc_vsx_stxvll :
Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i64_ty],
[IntrWriteMem, IntrArgMemOnly]>;
// Vector store.
def int_ppc_vsx_stxvw4x : Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly]>;
def int_ppc_vsx_stxvd2x : Intrinsic<[], [llvm_v2f64_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly]>;
def int_ppc_vsx_stxvw4x_be : Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly]>;
def int_ppc_vsx_stxvd2x_be : Intrinsic<[], [llvm_v2f64_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly]>;
// Vector and scalar maximum.
def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">;
def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">;
def int_ppc_vsx_xsmaxdp : PowerPC_VSX_Sca_DDD_Intrinsic<"xsmaxdp">;
// Vector and scalar minimum.
def int_ppc_vsx_xvmindp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmindp">;
def int_ppc_vsx_xvminsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvminsp">;
def int_ppc_vsx_xsmindp : PowerPC_VSX_Sca_DDD_Intrinsic<"xsmindp">;
// Vector divide.
def int_ppc_vsx_xvdivdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvdivdp">;
def int_ppc_vsx_xvdivsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvdivsp">;
// Vector round-to-infinity (ceil)
def int_ppc_vsx_xvrspip :
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvrdpip :
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
// Vector reciprocal estimate
def int_ppc_vsx_xvresp : GCCBuiltin<"__builtin_vsx_xvresp">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvredp : GCCBuiltin<"__builtin_vsx_xvredp">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
// Vector rsqrte
def int_ppc_vsx_xvrsqrtesp : GCCBuiltin<"__builtin_vsx_xvrsqrtesp">,
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvrsqrtedp : GCCBuiltin<"__builtin_vsx_xvrsqrtedp">,
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
// Vector compare
def int_ppc_vsx_xvcmpeqdp :
PowerPC_VSX_Intrinsic<"xvcmpeqdp", [llvm_v2i64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcmpeqdp_p : GCCBuiltin<"__builtin_vsx_xvcmpeqdp_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2f64_ty,llvm_v2f64_ty],
[IntrNoMem]>;
def int_ppc_vsx_xvcmpeqsp :
PowerPC_VSX_Intrinsic<"xvcmpeqsp", [llvm_v4i32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcmpeqsp_p : GCCBuiltin<"__builtin_vsx_xvcmpeqsp_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_vsx_xvcmpgedp :
PowerPC_VSX_Intrinsic<"xvcmpgedp", [llvm_v2i64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcmpgedp_p : GCCBuiltin<"__builtin_vsx_xvcmpgedp_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2f64_ty,llvm_v2f64_ty],
[IntrNoMem]>;
def int_ppc_vsx_xvcmpgesp :
PowerPC_VSX_Intrinsic<"xvcmpgesp", [llvm_v4i32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcmpgesp_p : GCCBuiltin<"__builtin_vsx_xvcmpgesp_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_vsx_xvcmpgtdp :
PowerPC_VSX_Intrinsic<"xvcmpgtdp", [llvm_v2i64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcmpgtdp_p : GCCBuiltin<"__builtin_vsx_xvcmpgtdp_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v2f64_ty,llvm_v2f64_ty],
[IntrNoMem]>;
def int_ppc_vsx_xvcmpgtsp :
PowerPC_VSX_Intrinsic<"xvcmpgtsp", [llvm_v4i32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcmpgtsp_p : GCCBuiltin<"__builtin_vsx_xvcmpgtsp_p">,
Intrinsic<[llvm_i32_ty],[llvm_i32_ty,llvm_v4f32_ty,llvm_v4f32_ty],
[IntrNoMem]>;
def int_ppc_vsx_xxleqv :
PowerPC_VSX_Intrinsic<"xxleqv", [llvm_v4i32_ty],
[llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
def int_ppc_vsx_xviexpdp :
PowerPC_VSX_Intrinsic<"xviexpdp",[llvm_v2f64_ty],
[llvm_v2i64_ty, llvm_v2i64_ty],[IntrNoMem]>;
def int_ppc_vsx_xviexpsp :
PowerPC_VSX_Intrinsic<"xviexpsp",[llvm_v4f32_ty],
[llvm_v4i32_ty, llvm_v4i32_ty],[IntrNoMem]>;
def int_ppc_vsx_xvcvdpsxws :
PowerPC_VSX_Intrinsic<"xvcvdpsxws", [llvm_v4i32_ty],
[llvm_v2f64_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcvdpuxws :
PowerPC_VSX_Intrinsic<"xvcvdpuxws", [llvm_v4i32_ty],
[llvm_v2f64_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcvsxwdp :
PowerPC_VSX_Intrinsic<"xvcvsxwdp", [llvm_v2f64_ty],
[llvm_v4i32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcvuxwdp :
PowerPC_VSX_Intrinsic<"xvcvuxwdp", [llvm_v2f64_ty],
[llvm_v4i32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcvspdp :
PowerPC_VSX_Intrinsic<"xvcvspdp", [llvm_v2f64_ty],
[llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcvsxdsp :
PowerPC_VSX_Intrinsic<"xvcvsxdsp", [llvm_v4f32_ty],
[llvm_v2i64_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcvuxdsp :
PowerPC_VSX_Intrinsic<"xvcvuxdsp", [llvm_v4f32_ty],
[llvm_v2i64_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcvdpsp :
PowerPC_VSX_Intrinsic<"xvcvdpsp", [llvm_v4f32_ty],
[llvm_v2f64_ty], [IntrNoMem]>;
def int_ppc_vsx_xvcvsphp :
PowerPC_VSX_Intrinsic<"xvcvsphp", [llvm_v4f32_ty],
[llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvxexpdp :
PowerPC_VSX_Intrinsic<"xvxexpdp", [llvm_v2i64_ty],
[llvm_v2f64_ty], [IntrNoMem]>;
def int_ppc_vsx_xvxexpsp :
PowerPC_VSX_Intrinsic<"xvxexpsp", [llvm_v4i32_ty],
[llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvxsigdp :
PowerPC_VSX_Intrinsic<"xvxsigdp", [llvm_v2i64_ty],
[llvm_v2f64_ty], [IntrNoMem]>;
def int_ppc_vsx_xvxsigsp :
PowerPC_VSX_Intrinsic<"xvxsigsp", [llvm_v4i32_ty],
[llvm_v4f32_ty], [IntrNoMem]>;
def int_ppc_vsx_xvtstdcdp :
PowerPC_VSX_Intrinsic<"xvtstdcdp", [llvm_v2i64_ty],
[llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
def int_ppc_vsx_xvtstdcsp :
PowerPC_VSX_Intrinsic<"xvtstdcsp", [llvm_v4i32_ty],
[llvm_v4f32_ty,llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
def int_ppc_vsx_xvcvhpsp :
PowerPC_VSX_Intrinsic<"xvcvhpsp", [llvm_v4f32_ty],
[llvm_v8i16_ty],[IntrNoMem]>;
def int_ppc_vsx_xxextractuw :
PowerPC_VSX_Intrinsic<"xxextractuw",[llvm_v2i64_ty],
[llvm_v2i64_ty,llvm_i32_ty], [IntrNoMem]>;
def int_ppc_vsx_xxinsertw :
PowerPC_VSX_Intrinsic<"xxinsertw",[llvm_v4i32_ty],
[llvm_v4i32_ty,llvm_v2i64_ty,llvm_i32_ty],
[IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
// PowerPC QPX Intrinsics.
//
let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.".
/// PowerPC_QPX_Intrinsic - Base class for all QPX intrinsics.
class PowerPC_QPX_Intrinsic<string GCCIntSuffix, list<LLVMType> ret_types,
list<LLVMType> param_types,
list<IntrinsicProperty> properties>
: GCCBuiltin<!strconcat("__builtin_qpx_", GCCIntSuffix)>,
Intrinsic<ret_types, param_types, properties>;
}
//===----------------------------------------------------------------------===//
// PowerPC QPX Intrinsic Class Definitions.
//
/// PowerPC_QPX_FF_Intrinsic - A PowerPC intrinsic that takes one v4f64
/// vector and returns one. These intrinsics have no side effects.
class PowerPC_QPX_FF_Intrinsic<string GCCIntSuffix>
: PowerPC_QPX_Intrinsic<GCCIntSuffix,
[llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>;
/// PowerPC_QPX_FFF_Intrinsic - A PowerPC intrinsic that takes two v4f64
/// vectors and returns one. These intrinsics have no side effects.
class PowerPC_QPX_FFF_Intrinsic<string GCCIntSuffix>
: PowerPC_QPX_Intrinsic<GCCIntSuffix,
[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty],
[IntrNoMem]>;
/// PowerPC_QPX_FFFF_Intrinsic - A PowerPC intrinsic that takes three v4f64
/// vectors and returns one. These intrinsics have no side effects.
class PowerPC_QPX_FFFF_Intrinsic<string GCCIntSuffix>
: PowerPC_QPX_Intrinsic<GCCIntSuffix,
[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
[IntrNoMem]>;
/// PowerPC_QPX_Load_Intrinsic - A PowerPC intrinsic that takes a pointer
/// and returns a v4f64.
class PowerPC_QPX_Load_Intrinsic<string GCCIntSuffix>
: PowerPC_QPX_Intrinsic<GCCIntSuffix,
[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
/// PowerPC_QPX_LoadPerm_Intrinsic - A PowerPC intrinsic that takes a pointer
/// and returns a v4f64 permutation.
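/// These are marked IntrNoMem: the qvlpc* instructions compute the
/// permutation control from the effective address itself and do not access
/// the memory it points to.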
class PowerPC_QPX_LoadPerm_Intrinsic<string GCCIntSuffix>
: PowerPC_QPX_Intrinsic<GCCIntSuffix,
[llvm_v4f64_ty], [llvm_ptr_ty], [IntrNoMem]>;
/// PowerPC_QPX_Store_Intrinsic - A PowerPC intrinsic that takes a pointer
/// and stores a v4f64.
class PowerPC_QPX_Store_Intrinsic<string GCCIntSuffix>
: PowerPC_QPX_Intrinsic<GCCIntSuffix,
[], [llvm_v4f64_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly]>;
//===----------------------------------------------------------------------===//
// PowerPC QPX Intrinsic Definitions.
let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
// Add Instructions
def int_ppc_qpx_qvfadd : PowerPC_QPX_FFF_Intrinsic<"qvfadd">;
def int_ppc_qpx_qvfadds : PowerPC_QPX_FFF_Intrinsic<"qvfadds">;
def int_ppc_qpx_qvfsub : PowerPC_QPX_FFF_Intrinsic<"qvfsub">;
def int_ppc_qpx_qvfsubs : PowerPC_QPX_FFF_Intrinsic<"qvfsubs">;
// Estimate Instructions
def int_ppc_qpx_qvfre : PowerPC_QPX_FF_Intrinsic<"qvfre">;
def int_ppc_qpx_qvfres : PowerPC_QPX_FF_Intrinsic<"qvfres">;
def int_ppc_qpx_qvfrsqrte : PowerPC_QPX_FF_Intrinsic<"qvfrsqrte">;
def int_ppc_qpx_qvfrsqrtes : PowerPC_QPX_FF_Intrinsic<"qvfrsqrtes">;
// Multiply Instructions
def int_ppc_qpx_qvfmul : PowerPC_QPX_FFF_Intrinsic<"qvfmul">;
def int_ppc_qpx_qvfmuls : PowerPC_QPX_FFF_Intrinsic<"qvfmuls">;
def int_ppc_qpx_qvfxmul : PowerPC_QPX_FFF_Intrinsic<"qvfxmul">;
def int_ppc_qpx_qvfxmuls : PowerPC_QPX_FFF_Intrinsic<"qvfxmuls">;
// Multiply-add instructions
def int_ppc_qpx_qvfmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfmadd">;
def int_ppc_qpx_qvfmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfmadds">;
def int_ppc_qpx_qvfnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadd">;
def int_ppc_qpx_qvfnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadds">;
def int_ppc_qpx_qvfmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfmsub">;
def int_ppc_qpx_qvfmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfmsubs">;
def int_ppc_qpx_qvfnmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsub">;
def int_ppc_qpx_qvfnmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsubs">;
def int_ppc_qpx_qvfxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadd">;
def int_ppc_qpx_qvfxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadds">;
def int_ppc_qpx_qvfxxnpmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadd">;
def int_ppc_qpx_qvfxxnpmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadds">;
def int_ppc_qpx_qvfxxcpnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadd">;
def int_ppc_qpx_qvfxxcpnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadds">;
def int_ppc_qpx_qvfxxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadd">;
def int_ppc_qpx_qvfxxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadds">;
// Select Instruction
def int_ppc_qpx_qvfsel : PowerPC_QPX_FFFF_Intrinsic<"qvfsel">;
// Permute Instruction
def int_ppc_qpx_qvfperm : PowerPC_QPX_FFFF_Intrinsic<"qvfperm">;
// Convert and Round Instructions
def int_ppc_qpx_qvfctid : PowerPC_QPX_FF_Intrinsic<"qvfctid">;
def int_ppc_qpx_qvfctidu : PowerPC_QPX_FF_Intrinsic<"qvfctidu">;
def int_ppc_qpx_qvfctidz : PowerPC_QPX_FF_Intrinsic<"qvfctidz">;
def int_ppc_qpx_qvfctiduz : PowerPC_QPX_FF_Intrinsic<"qvfctiduz">;
def int_ppc_qpx_qvfctiw : PowerPC_QPX_FF_Intrinsic<"qvfctiw">;
def int_ppc_qpx_qvfctiwu : PowerPC_QPX_FF_Intrinsic<"qvfctiwu">;
def int_ppc_qpx_qvfctiwz : PowerPC_QPX_FF_Intrinsic<"qvfctiwz">;
def int_ppc_qpx_qvfctiwuz : PowerPC_QPX_FF_Intrinsic<"qvfctiwuz">;
def int_ppc_qpx_qvfcfid : PowerPC_QPX_FF_Intrinsic<"qvfcfid">;
def int_ppc_qpx_qvfcfidu : PowerPC_QPX_FF_Intrinsic<"qvfcfidu">;
def int_ppc_qpx_qvfcfids : PowerPC_QPX_FF_Intrinsic<"qvfcfids">;
def int_ppc_qpx_qvfcfidus : PowerPC_QPX_FF_Intrinsic<"qvfcfidus">;
def int_ppc_qpx_qvfrsp : PowerPC_QPX_FF_Intrinsic<"qvfrsp">;
def int_ppc_qpx_qvfriz : PowerPC_QPX_FF_Intrinsic<"qvfriz">;
def int_ppc_qpx_qvfrin : PowerPC_QPX_FF_Intrinsic<"qvfrin">;
def int_ppc_qpx_qvfrip : PowerPC_QPX_FF_Intrinsic<"qvfrip">;
def int_ppc_qpx_qvfrim : PowerPC_QPX_FF_Intrinsic<"qvfrim">;
// Move Instructions
def int_ppc_qpx_qvfneg : PowerPC_QPX_FF_Intrinsic<"qvfneg">;
def int_ppc_qpx_qvfabs : PowerPC_QPX_FF_Intrinsic<"qvfabs">;
def int_ppc_qpx_qvfnabs : PowerPC_QPX_FF_Intrinsic<"qvfnabs">;
def int_ppc_qpx_qvfcpsgn : PowerPC_QPX_FFF_Intrinsic<"qvfcpsgn">;
// Compare Instructions
def int_ppc_qpx_qvftstnan : PowerPC_QPX_FFF_Intrinsic<"qvftstnan">;
def int_ppc_qpx_qvfcmplt : PowerPC_QPX_FFF_Intrinsic<"qvfcmplt">;
def int_ppc_qpx_qvfcmpgt : PowerPC_QPX_FFF_Intrinsic<"qvfcmpgt">;
def int_ppc_qpx_qvfcmpeq : PowerPC_QPX_FFF_Intrinsic<"qvfcmpeq">;
// Load instructions
def int_ppc_qpx_qvlfd : PowerPC_QPX_Load_Intrinsic<"qvlfd">;
def int_ppc_qpx_qvlfda : PowerPC_QPX_Load_Intrinsic<"qvlfda">;
def int_ppc_qpx_qvlfs : PowerPC_QPX_Load_Intrinsic<"qvlfs">;
def int_ppc_qpx_qvlfsa : PowerPC_QPX_Load_Intrinsic<"qvlfsa">;
def int_ppc_qpx_qvlfcda : PowerPC_QPX_Load_Intrinsic<"qvlfcda">;
def int_ppc_qpx_qvlfcd : PowerPC_QPX_Load_Intrinsic<"qvlfcd">;
def int_ppc_qpx_qvlfcsa : PowerPC_QPX_Load_Intrinsic<"qvlfcsa">;
def int_ppc_qpx_qvlfcs : PowerPC_QPX_Load_Intrinsic<"qvlfcs">;
def int_ppc_qpx_qvlfiwaa : PowerPC_QPX_Load_Intrinsic<"qvlfiwaa">;
def int_ppc_qpx_qvlfiwa : PowerPC_QPX_Load_Intrinsic<"qvlfiwa">;
def int_ppc_qpx_qvlfiwza : PowerPC_QPX_Load_Intrinsic<"qvlfiwza">;
def int_ppc_qpx_qvlfiwz : PowerPC_QPX_Load_Intrinsic<"qvlfiwz">;
def int_ppc_qpx_qvlpcld : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcld">;
def int_ppc_qpx_qvlpcls : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcls">;
def int_ppc_qpx_qvlpcrd : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrd">;
def int_ppc_qpx_qvlpcrs : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrs">;
// Store instructions
def int_ppc_qpx_qvstfd : PowerPC_QPX_Store_Intrinsic<"qvstfd">;
def int_ppc_qpx_qvstfda : PowerPC_QPX_Store_Intrinsic<"qvstfda">;
def int_ppc_qpx_qvstfs : PowerPC_QPX_Store_Intrinsic<"qvstfs">;
def int_ppc_qpx_qvstfsa : PowerPC_QPX_Store_Intrinsic<"qvstfsa">;
def int_ppc_qpx_qvstfcda : PowerPC_QPX_Store_Intrinsic<"qvstfcda">;
def int_ppc_qpx_qvstfcd : PowerPC_QPX_Store_Intrinsic<"qvstfcd">;
def int_ppc_qpx_qvstfcsa : PowerPC_QPX_Store_Intrinsic<"qvstfcsa">;
def int_ppc_qpx_qvstfcs : PowerPC_QPX_Store_Intrinsic<"qvstfcs">;
def int_ppc_qpx_qvstfiwa : PowerPC_QPX_Store_Intrinsic<"qvstfiwa">;
def int_ppc_qpx_qvstfiw : PowerPC_QPX_Store_Intrinsic<"qvstfiw">;
// Logical and permutation formation
def int_ppc_qpx_qvflogical : PowerPC_QPX_Intrinsic<"qvflogical",
[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_ppc_qpx_qvgpci : PowerPC_QPX_Intrinsic<"qvgpci",
[llvm_v4f64_ty], [llvm_i32_ty], [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
// PowerPC HTM Intrinsic Definitions.
let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
def int_ppc_tbegin : GCCBuiltin<"__builtin_tbegin">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg<0>]>;
def int_ppc_tend : GCCBuiltin<"__builtin_tend">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg<0>]>;
def int_ppc_tabort : GCCBuiltin<"__builtin_tabort">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
def int_ppc_tabortwc : GCCBuiltin<"__builtin_tabortwc">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
def int_ppc_tabortwci : GCCBuiltin<"__builtin_tabortwci">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
def int_ppc_tabortdc : GCCBuiltin<"__builtin_tabortdc">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
def int_ppc_tabortdci : GCCBuiltin<"__builtin_tabortdci">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
def int_ppc_tcheck : GCCBuiltin<"__builtin_tcheck">,
Intrinsic<[llvm_i32_ty], [], []>;
def int_ppc_treclaim : GCCBuiltin<"__builtin_treclaim">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
def int_ppc_trechkpt : GCCBuiltin<"__builtin_trechkpt">,
Intrinsic<[llvm_i32_ty], [], []>;
def int_ppc_tsr : GCCBuiltin<"__builtin_tsr">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
def int_ppc_get_texasr : GCCBuiltin<"__builtin_get_texasr">,
Intrinsic<[llvm_i64_ty], [], []>;
def int_ppc_get_texasru : GCCBuiltin<"__builtin_get_texasru">,
Intrinsic<[llvm_i64_ty], [], []>;
def int_ppc_get_tfhar : GCCBuiltin<"__builtin_get_tfhar">,
Intrinsic<[llvm_i64_ty], [], []>;
def int_ppc_get_tfiar : GCCBuiltin<"__builtin_get_tfiar">,
Intrinsic<[llvm_i64_ty], [], []>;
def int_ppc_set_texasr : GCCBuiltin<"__builtin_set_texasr">,
Intrinsic<[], [llvm_i64_ty], []>;
def int_ppc_set_texasru : GCCBuiltin<"__builtin_set_texasru">,
Intrinsic<[], [llvm_i64_ty], []>;
def int_ppc_set_tfhar : GCCBuiltin<"__builtin_set_tfhar">,
Intrinsic<[], [llvm_i64_ty], []>;
def int_ppc_set_tfiar : GCCBuiltin<"__builtin_set_tfiar">,
Intrinsic<[], [llvm_i64_ty], []>;
// Extended mnemonics
def int_ppc_tendall : GCCBuiltin<"__builtin_tendall">,
Intrinsic<[llvm_i32_ty], [], []>;
def int_ppc_tresume : GCCBuiltin<"__builtin_tresume">,
Intrinsic<[llvm_i32_ty], [], []>;
def int_ppc_tsuspend : GCCBuiltin<"__builtin_tsuspend">,
Intrinsic<[llvm_i32_ty], [], []>;
def int_ppc_ttest : GCCBuiltin<"__builtin_ttest">,
Intrinsic<[llvm_i64_ty], [], []>;
def int_ppc_cfence : Intrinsic<[], [llvm_anyint_ty], []>;
// PowerPC set FPSCR Intrinsic Definitions.
def int_ppc_setrnd : GCCBuiltin<"__builtin_setrnd">,
Intrinsic<[llvm_double_ty], [llvm_i32_ty], []>;
}
Index: head/contrib/llvm-project/llvm/include/llvm/Support/ManagedStatic.h
===================================================================
--- head/contrib/llvm-project/llvm/include/llvm/Support/ManagedStatic.h (revision 362608)
+++ head/contrib/llvm-project/llvm/include/llvm/Support/ManagedStatic.h (revision 362609)
@@ -1,125 +1,125 @@
//===-- llvm/Support/ManagedStatic.h - Static Global wrapper ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the ManagedStatic class and the llvm_shutdown() function.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_SUPPORT_MANAGEDSTATIC_H
#define LLVM_SUPPORT_MANAGEDSTATIC_H
#include <atomic>
#include <cstddef>
namespace llvm {
/// object_creator - Helper for ManagedStatic; constructs an object of type C.
template <class C> struct object_creator {
static void *call() { return new C(); }
};
/// object_deleter - Helper for ManagedStatic; destroys the owned object.
///
template <typename T> struct object_deleter {
static void call(void *Ptr) { delete (T *)Ptr; }
};
template <typename T, size_t N> struct object_deleter<T[N]> {
static void call(void *Ptr) { delete[](T *)Ptr; }
};
// ManagedStatic must be initialized to zero, and it must *not* have a dynamic
// initializer because managed statics are often created while running other
// dynamic initializers. In standard C++11, the best way to accomplish this is
// with a constexpr default constructor. However, different versions of the
// Visual C++ compiler have had bugs where, even though the constructor may be
// constexpr, a dynamic initializer may be emitted depending on optimization
// settings. For the affected versions of MSVC, use the old linker
// initialization pattern of not providing a constructor and leaving the fields
-// uninitialized.
-#if !defined(_MSC_VER) || defined(__clang__)
+// uninitialized. See http://llvm.org/PR41367 for details.
+#if !defined(_MSC_VER) || (_MSC_VER >= 1925) || defined(__clang__)
#define LLVM_USE_CONSTEXPR_CTOR
#endif
/// ManagedStaticBase - Common base class for ManagedStatic instances.
class ManagedStaticBase {
protected:
#ifdef LLVM_USE_CONSTEXPR_CTOR
mutable std::atomic<void *> Ptr{};
mutable void (*DeleterFn)(void *) = nullptr;
mutable const ManagedStaticBase *Next = nullptr;
#else
// This should only be used as a static variable, which guarantees that this
// will be zero initialized.
mutable std::atomic<void *> Ptr;
mutable void (*DeleterFn)(void *);
mutable const ManagedStaticBase *Next;
#endif
void RegisterManagedStatic(void *(*creator)(), void (*deleter)(void*)) const;
public:
#ifdef LLVM_USE_CONSTEXPR_CTOR
constexpr ManagedStaticBase() = default;
#endif
/// isConstructed - Return true if this object has already been created.
bool isConstructed() const { return Ptr != nullptr; }
void destroy() const;
};
/// ManagedStatic - This transparently changes the behavior of global statics to
/// be lazily constructed on demand (good for reducing startup times of dynamic
/// libraries that link in LLVM components) and for making destruction be
/// explicit through the llvm_shutdown() function call.
///
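/// A minimal usage sketch (the name 'Registry' here is illustrative only):
///
///   static ManagedStatic<std::vector<std::string>> Registry;
///   void record(const std::string &S) { Registry->push_back(S); }
///
/// Registry is constructed on the first dereference and destroyed by
/// llvm_shutdown().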
template <class C, class Creator = object_creator<C>,
class Deleter = object_deleter<C>>
class ManagedStatic : public ManagedStaticBase {
public:
// Accessors.
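// Both accessors use a double-checked pattern: the acquire load ensures that
// once Ptr is seen non-null, the construction of the pointed-to object is
// also visible to this thread; the relaxed reload after registration is safe
// because RegisterManagedStatic itself performs the synchronizing store.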
C &operator*() {
void *Tmp = Ptr.load(std::memory_order_acquire);
if (!Tmp)
RegisterManagedStatic(Creator::call, Deleter::call);
return *static_cast<C *>(Ptr.load(std::memory_order_relaxed));
}
C *operator->() { return &**this; }
const C &operator*() const {
void *Tmp = Ptr.load(std::memory_order_acquire);
if (!Tmp)
RegisterManagedStatic(Creator::call, Deleter::call);
return *static_cast<C *>(Ptr.load(std::memory_order_relaxed));
}
const C *operator->() const { return &**this; }
// Extract the instance, leaving the ManagedStatic uninitialized. The
// user is then responsible for the lifetime of the returned instance.
C *claim() {
return static_cast<C *>(Ptr.exchange(nullptr));
}
};
/// llvm_shutdown - Deallocate and destroy all ManagedStatic variables.
void llvm_shutdown();
/// llvm_shutdown_obj - This is a simple helper class that calls
/// llvm_shutdown() when it is destroyed.
struct llvm_shutdown_obj {
llvm_shutdown_obj() = default;
~llvm_shutdown_obj() { llvm_shutdown(); }
};
} // end namespace llvm
#endif // LLVM_SUPPORT_MANAGEDSTATIC_H
Index: head/contrib/llvm-project/llvm/include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- head/contrib/llvm-project/llvm/include/llvm/Target/TargetSelectionDAG.td (revision 362608)
+++ head/contrib/llvm-project/llvm/include/llvm/Target/TargetSelectionDAG.td (revision 362609)
@@ -1,1619 +1,1628 @@
//===- TargetSelectionDAG.td - Common code for DAG isels ---*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the target-independent interfaces used by SelectionDAG
// instruction selection generators.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Selection DAG Type Constraint definitions.
//
// Note that the semantics of these constraints are hard coded into tblgen. To
// modify or add constraints, you have to hack tblgen.
//
class SDTypeConstraint<int opnum> {
int OperandNum = opnum;
}
// SDTCisVT - The specified operand has exactly this VT.
class SDTCisVT<int OpNum, ValueType vt> : SDTypeConstraint<OpNum> {
ValueType VT = vt;
}
// SDTCisPtrTy - The specified operand has pointer type.
class SDTCisPtrTy<int OpNum> : SDTypeConstraint<OpNum>;
// SDTCisInt - The specified operand has integer type.
class SDTCisInt<int OpNum> : SDTypeConstraint<OpNum>;
// SDTCisFP - The specified operand has floating-point type.
class SDTCisFP<int OpNum> : SDTypeConstraint<OpNum>;
// SDTCisVec - The specified operand has a vector type.
class SDTCisVec<int OpNum> : SDTypeConstraint<OpNum>;
// SDTCisSameAs - The two specified operands have identical types.
class SDTCisSameAs<int OpNum, int OtherOp> : SDTypeConstraint<OpNum> {
int OtherOperandNum = OtherOp;
}
// SDTCisVTSmallerThanOp - The specified operand is a VT SDNode, and its type is
// smaller than the type of the 'Other' operand.
class SDTCisVTSmallerThanOp<int OpNum, int OtherOp> : SDTypeConstraint<OpNum> {
int OtherOperandNum = OtherOp;
}
// SDTCisOpSmallerThanOp - The 'SmallOp' operand has a smaller type than the
// 'BigOp' operand.
class SDTCisOpSmallerThanOp<int SmallOp, int BigOp> : SDTypeConstraint<SmallOp>{
int BigOperandNum = BigOp;
}
/// SDTCisEltOfVec - This indicates that ThisOp is a scalar of the same type
/// as the element type of OtherOp, which is a vector type.
class SDTCisEltOfVec<int ThisOp, int OtherOp>
: SDTypeConstraint<ThisOp> {
int OtherOpNum = OtherOp;
}
/// SDTCisSubVecOfVec - This indicates that ThisOp is a vector type
/// with length less than that of OtherOp, which is a vector type.
class SDTCisSubVecOfVec<int ThisOp, int OtherOp>
: SDTypeConstraint<ThisOp> {
int OtherOpNum = OtherOp;
}
// SDTCVecEltisVT - The specified operand is a vector type with element type
// of VT.
class SDTCVecEltisVT<int OpNum, ValueType vt> : SDTypeConstraint<OpNum> {
ValueType VT = vt;
}
// SDTCisSameNumEltsAs - The two specified operands have identical number
// of elements.
class SDTCisSameNumEltsAs<int OpNum, int OtherOp> : SDTypeConstraint<OpNum> {
int OtherOperandNum = OtherOp;
}
// SDTCisSameSizeAs - The two specified operands have identical size.
class SDTCisSameSizeAs<int OpNum, int OtherOp> : SDTypeConstraint<OpNum> {
int OtherOperandNum = OtherOp;
}
//===----------------------------------------------------------------------===//
// Selection DAG Type Profile definitions.
//
// These use the constraints defined above to describe the type requirements of
// the various nodes. These are not hard coded into tblgen, allowing targets to
// add their own if needed.
//
// SDTypeProfile - This profile describes the type requirements of a Selection
// DAG node.
class SDTypeProfile<int numresults, int numoperands,
list<SDTypeConstraint> constraints> {
int NumResults = numresults;
int NumOperands = numoperands;
list<SDTypeConstraint> Constraints = constraints;
}
// Builtin profiles.
def SDTIntLeaf: SDTypeProfile<1, 0, [SDTCisInt<0>]>; // for 'imm'.
def SDTFPLeaf : SDTypeProfile<1, 0, [SDTCisFP<0>]>; // for 'fpimm'.
def SDTPtrLeaf: SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; // for '&g'.
def SDTOther : SDTypeProfile<1, 0, [SDTCisVT<0, OtherVT>]>; // for 'vt'.
def SDTUNDEF : SDTypeProfile<1, 0, []>; // for 'undef'.
def SDTUnaryOp : SDTypeProfile<1, 1, []>; // for bitconvert.
def SDTIntBinOp : SDTypeProfile<1, 2, [ // add, and, or, xor, udiv, etc.
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>
]>;
def SDTIntShiftOp : SDTypeProfile<1, 2, [ // shl, sra, srl
SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>
]>;
def SDTIntShiftDOp: SDTypeProfile<1, 3, [ // fshl, fshr
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
]>;
def SDTIntSatNoShOp : SDTypeProfile<1, 2, [ // ssat with no shift
SDTCisSameAs<0, 1>, SDTCisInt<2>
]>;
def SDTIntBinHiLoOp : SDTypeProfile<2, 2, [ // mulhi, mullo, sdivrem, udivrem
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,SDTCisInt<0>
]>;
def SDTIntScaledBinOp : SDTypeProfile<1, 3, [ // smulfix, sdivfix, etc
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
]>;
def SDTFPBinOp : SDTypeProfile<1, 2, [ // fadd, fmul, etc.
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>
]>;
def SDTFPSignOp : SDTypeProfile<1, 2, [ // fcopysign.
SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisFP<2>
]>;
def SDTFPTernaryOp : SDTypeProfile<1, 3, [ // fmadd, fnmsub, etc.
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0>
]>;
def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // bitreverse
SDTCisSameAs<0, 1>, SDTCisInt<0>
]>;
def SDTIntBitCountUnaryOp : SDTypeProfile<1, 1, [ // ctlz, cttz
SDTCisInt<0>, SDTCisInt<1>
]>;
def SDTIntExtendOp : SDTypeProfile<1, 1, [ // sext, zext, anyext
SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1>
]>;
def SDTIntTruncOp : SDTypeProfile<1, 1, [ // trunc
SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>
]>;
def SDTFPUnaryOp : SDTypeProfile<1, 1, [ // fneg, fsqrt, etc
SDTCisSameAs<0, 1>, SDTCisFP<0>
]>;
def SDTFPRoundOp : SDTypeProfile<1, 1, [ // fround
SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>
]>;
def SDTFPExtendOp : SDTypeProfile<1, 1, [ // fextend
SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1>
]>;
def SDTIntToFPOp : SDTypeProfile<1, 1, [ // [su]int_to_fp
SDTCisFP<0>, SDTCisInt<1>, SDTCisSameNumEltsAs<0, 1>
]>;
def SDTFPToIntOp : SDTypeProfile<1, 1, [ // fp_to_[su]int
SDTCisInt<0>, SDTCisFP<1>, SDTCisSameNumEltsAs<0, 1>
]>;
def SDTExtInreg : SDTypeProfile<1, 2, [ // sext_inreg
SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisVT<2, OtherVT>,
SDTCisVTSmallerThanOp<2, 1>
]>;
def SDTExtInvec : SDTypeProfile<1, 1, [ // sext_invec
SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>, SDTCisVec<1>,
SDTCisOpSmallerThanOp<1, 0>
]>;
def SDTSetCC : SDTypeProfile<1, 3, [ // setcc
SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
def SDTSelect : SDTypeProfile<1, 3, [ // select
SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>
]>;
def SDTVSelect : SDTypeProfile<1, 3, [ // vselect
SDTCisVec<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameNumEltsAs<0, 1>
]>;
def SDTSelectCC : SDTypeProfile<1, 5, [ // select_cc
SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>, SDTCisSameAs<0, 3>,
SDTCisVT<5, OtherVT>
]>;
def SDTBr : SDTypeProfile<0, 1, [ // br
SDTCisVT<0, OtherVT>
]>;
def SDTBrCC : SDTypeProfile<0, 4, [ // brcc
SDTCisVT<0, OtherVT>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
def SDTBrcond : SDTypeProfile<0, 2, [ // brcond
SDTCisInt<0>, SDTCisVT<1, OtherVT>
]>;
def SDTBrind : SDTypeProfile<0, 1, [ // brind
SDTCisPtrTy<0>
]>;
def SDTCatchret : SDTypeProfile<0, 2, [ // catchret
SDTCisVT<0, OtherVT>, SDTCisVT<1, OtherVT>
]>;
def SDTNone : SDTypeProfile<0, 0, []>; // ret, trap
def SDTLoad : SDTypeProfile<1, 1, [ // load
SDTCisPtrTy<1>
]>;
def SDTStore : SDTypeProfile<0, 2, [ // store
SDTCisPtrTy<1>
]>;
def SDTIStore : SDTypeProfile<1, 3, [ // indexed store
SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3>
]>;
def SDTMaskedStore: SDTypeProfile<0, 4, [ // masked store
SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisSameNumEltsAs<0, 3>
]>;
def SDTMaskedLoad: SDTypeProfile<1, 4, [ // masked load
SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisSameAs<0, 4>,
SDTCisSameNumEltsAs<0, 3>
]>;
def SDTVecShuffle : SDTypeProfile<1, 2, [
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
]>;
def SDTVecExtract : SDTypeProfile<1, 2, [ // vector extract
SDTCisEltOfVec<0, 1>, SDTCisPtrTy<2>
]>;
def SDTVecInsert : SDTypeProfile<1, 3, [ // vector insert
SDTCisEltOfVec<2, 1>, SDTCisSameAs<0, 1>, SDTCisPtrTy<3>
]>;
def SDTVecReduce : SDTypeProfile<1, 1, [ // vector reduction
SDTCisInt<0>, SDTCisVec<1>
]>;
def SDTSubVecExtract : SDTypeProfile<1, 2, [// subvector extract
SDTCisSubVecOfVec<0,1>, SDTCisInt<2>
]>;
def SDTSubVecInsert : SDTypeProfile<1, 3, [ // subvector insert
SDTCisSubVecOfVec<2, 1>, SDTCisSameAs<0,1>, SDTCisInt<3>
]>;
def SDTPrefetch : SDTypeProfile<0, 4, [ // prefetch
SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, SDTCisInt<1>
]>;
def SDTMemBarrier : SDTypeProfile<0, 5, [ // memory barrier
SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisSameAs<0,3>, SDTCisSameAs<0,4>,
SDTCisInt<0>
]>;
def SDTAtomicFence : SDTypeProfile<0, 2, [
SDTCisSameAs<0,1>, SDTCisPtrTy<0>
]>;
def SDTAtomic3 : SDTypeProfile<1, 3, [
SDTCisSameAs<0,2>, SDTCisSameAs<0,3>, SDTCisInt<0>, SDTCisPtrTy<1>
]>;
def SDTAtomic2 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisInt<0>, SDTCisPtrTy<1>
]>;
def SDTFPAtomic2 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
def SDTAtomicStore : SDTypeProfile<0, 2, [
SDTCisPtrTy<0>, SDTCisInt<1>
]>;
def SDTAtomicLoad : SDTypeProfile<1, 1, [
SDTCisInt<0>, SDTCisPtrTy<1>
]>;
def SDTConvertOp : SDTypeProfile<1, 5, [ //cvtss, su, us, uu, ff, fs, fu, sf, su
SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>, SDTCisPtrTy<4>, SDTCisPtrTy<5>
]>;
class SDCallSeqStart<list<SDTypeConstraint> constraints> :
SDTypeProfile<0, 2, constraints>;
class SDCallSeqEnd<list<SDTypeConstraint> constraints> :
SDTypeProfile<0, 2, constraints>;
//===----------------------------------------------------------------------===//
// Selection DAG Node definitions.
//
class SDNode<string opcode, SDTypeProfile typeprof,
list<SDNodeProperty> props = [], string sdclass = "SDNode">
: SDPatternOperator {
string Opcode = opcode;
string SDClass = sdclass;
let Properties = props;
SDTypeProfile TypeProfile = typeprof;
}
// Special TableGen-recognized dag nodes
def set;
def implicit;
def node;
def srcvalue;
def imm : SDNode<"ISD::Constant" , SDTIntLeaf , [], "ConstantSDNode">;
def timm : SDNode<"ISD::TargetConstant",SDTIntLeaf, [], "ConstantSDNode">;
def fpimm : SDNode<"ISD::ConstantFP", SDTFPLeaf , [], "ConstantFPSDNode">;
def vt : SDNode<"ISD::VALUETYPE" , SDTOther , [], "VTSDNode">;
def bb : SDNode<"ISD::BasicBlock", SDTOther , [], "BasicBlockSDNode">;
def cond : SDNode<"ISD::CONDCODE" , SDTOther , [], "CondCodeSDNode">;
def undef : SDNode<"ISD::UNDEF" , SDTUNDEF , []>;
def globaladdr : SDNode<"ISD::GlobalAddress", SDTPtrLeaf, [],
"GlobalAddressSDNode">;
def tglobaladdr : SDNode<"ISD::TargetGlobalAddress", SDTPtrLeaf, [],
"GlobalAddressSDNode">;
def globaltlsaddr : SDNode<"ISD::GlobalTLSAddress", SDTPtrLeaf, [],
"GlobalAddressSDNode">;
def tglobaltlsaddr : SDNode<"ISD::TargetGlobalTLSAddress", SDTPtrLeaf, [],
"GlobalAddressSDNode">;
def constpool : SDNode<"ISD::ConstantPool", SDTPtrLeaf, [],
"ConstantPoolSDNode">;
def tconstpool : SDNode<"ISD::TargetConstantPool", SDTPtrLeaf, [],
"ConstantPoolSDNode">;
def jumptable : SDNode<"ISD::JumpTable", SDTPtrLeaf, [],
"JumpTableSDNode">;
def tjumptable : SDNode<"ISD::TargetJumpTable", SDTPtrLeaf, [],
"JumpTableSDNode">;
def frameindex : SDNode<"ISD::FrameIndex", SDTPtrLeaf, [],
"FrameIndexSDNode">;
def tframeindex : SDNode<"ISD::TargetFrameIndex", SDTPtrLeaf, [],
"FrameIndexSDNode">;
def externalsym : SDNode<"ISD::ExternalSymbol", SDTPtrLeaf, [],
"ExternalSymbolSDNode">;
def texternalsym: SDNode<"ISD::TargetExternalSymbol", SDTPtrLeaf, [],
"ExternalSymbolSDNode">;
def mcsym: SDNode<"ISD::MCSymbol", SDTPtrLeaf, [], "MCSymbolSDNode">;
def blockaddress : SDNode<"ISD::BlockAddress", SDTPtrLeaf, [],
"BlockAddressSDNode">;
def tblockaddress: SDNode<"ISD::TargetBlockAddress", SDTPtrLeaf, [],
"BlockAddressSDNode">;
def add : SDNode<"ISD::ADD" , SDTIntBinOp ,
[SDNPCommutative, SDNPAssociative]>;
def sub : SDNode<"ISD::SUB" , SDTIntBinOp>;
def mul : SDNode<"ISD::MUL" , SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def mulhs : SDNode<"ISD::MULHS" , SDTIntBinOp, [SDNPCommutative]>;
def mulhu : SDNode<"ISD::MULHU" , SDTIntBinOp, [SDNPCommutative]>;
def smullohi : SDNode<"ISD::SMUL_LOHI" , SDTIntBinHiLoOp, [SDNPCommutative]>;
def umullohi : SDNode<"ISD::UMUL_LOHI" , SDTIntBinHiLoOp, [SDNPCommutative]>;
def sdiv : SDNode<"ISD::SDIV" , SDTIntBinOp>;
def udiv : SDNode<"ISD::UDIV" , SDTIntBinOp>;
def srem : SDNode<"ISD::SREM" , SDTIntBinOp>;
def urem : SDNode<"ISD::UREM" , SDTIntBinOp>;
def sdivrem : SDNode<"ISD::SDIVREM" , SDTIntBinHiLoOp>;
def udivrem : SDNode<"ISD::UDIVREM" , SDTIntBinHiLoOp>;
def srl : SDNode<"ISD::SRL" , SDTIntShiftOp>;
def sra : SDNode<"ISD::SRA" , SDTIntShiftOp>;
def shl : SDNode<"ISD::SHL" , SDTIntShiftOp>;
def rotl : SDNode<"ISD::ROTL" , SDTIntShiftOp>;
def rotr : SDNode<"ISD::ROTR" , SDTIntShiftOp>;
def fshl : SDNode<"ISD::FSHL" , SDTIntShiftDOp>;
def fshr : SDNode<"ISD::FSHR" , SDTIntShiftDOp>;
def and : SDNode<"ISD::AND" , SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def or : SDNode<"ISD::OR" , SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def xor : SDNode<"ISD::XOR" , SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def addc : SDNode<"ISD::ADDC" , SDTIntBinOp,
[SDNPCommutative, SDNPOutGlue]>;
def adde : SDNode<"ISD::ADDE" , SDTIntBinOp,
[SDNPCommutative, SDNPOutGlue, SDNPInGlue]>;
def subc : SDNode<"ISD::SUBC" , SDTIntBinOp,
[SDNPOutGlue]>;
def sube : SDNode<"ISD::SUBE" , SDTIntBinOp,
[SDNPOutGlue, SDNPInGlue]>;
def smin : SDNode<"ISD::SMIN" , SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def smax : SDNode<"ISD::SMAX" , SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def umin : SDNode<"ISD::UMIN" , SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def umax : SDNode<"ISD::UMAX" , SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def saddsat : SDNode<"ISD::SADDSAT" , SDTIntBinOp, [SDNPCommutative]>;
def uaddsat : SDNode<"ISD::UADDSAT" , SDTIntBinOp, [SDNPCommutative]>;
def ssubsat : SDNode<"ISD::SSUBSAT" , SDTIntBinOp>;
def usubsat : SDNode<"ISD::USUBSAT" , SDTIntBinOp>;
def smulfix : SDNode<"ISD::SMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>;
def smulfixsat : SDNode<"ISD::SMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>;
def umulfix : SDNode<"ISD::UMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>;
def umulfixsat : SDNode<"ISD::UMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>;
def sdivfix : SDNode<"ISD::SDIVFIX" , SDTIntScaledBinOp>;
def udivfix : SDNode<"ISD::UDIVFIX" , SDTIntScaledBinOp>;
def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
def zext_invec : SDNode<"ISD::ZERO_EXTEND_VECTOR_INREG", SDTExtInvec>;
def abs : SDNode<"ISD::ABS" , SDTIntUnaryOp>;
def bitreverse : SDNode<"ISD::BITREVERSE" , SDTIntUnaryOp>;
def bswap : SDNode<"ISD::BSWAP" , SDTIntUnaryOp>;
def ctlz : SDNode<"ISD::CTLZ" , SDTIntBitCountUnaryOp>;
def cttz : SDNode<"ISD::CTTZ" , SDTIntBitCountUnaryOp>;
def ctpop : SDNode<"ISD::CTPOP" , SDTIntBitCountUnaryOp>;
def ctlz_zero_undef : SDNode<"ISD::CTLZ_ZERO_UNDEF", SDTIntBitCountUnaryOp>;
def cttz_zero_undef : SDNode<"ISD::CTTZ_ZERO_UNDEF", SDTIntBitCountUnaryOp>;
def sext : SDNode<"ISD::SIGN_EXTEND", SDTIntExtendOp>;
def zext : SDNode<"ISD::ZERO_EXTEND", SDTIntExtendOp>;
def anyext : SDNode<"ISD::ANY_EXTEND" , SDTIntExtendOp>;
def trunc : SDNode<"ISD::TRUNCATE" , SDTIntTruncOp>;
def bitconvert : SDNode<"ISD::BITCAST" , SDTUnaryOp>;
def addrspacecast : SDNode<"ISD::ADDRSPACECAST", SDTUnaryOp>;
def extractelt : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDTVecExtract>;
def insertelt : SDNode<"ISD::INSERT_VECTOR_ELT", SDTVecInsert>;
def vecreduce_add : SDNode<"ISD::VECREDUCE_ADD", SDTVecReduce>;
def vecreduce_smax : SDNode<"ISD::VECREDUCE_SMAX", SDTVecReduce>;
def vecreduce_umax : SDNode<"ISD::VECREDUCE_UMAX", SDTVecReduce>;
def vecreduce_smin : SDNode<"ISD::VECREDUCE_SMIN", SDTVecReduce>;
def vecreduce_umin : SDNode<"ISD::VECREDUCE_UMIN", SDTVecReduce>;
def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>;
def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>;
def fmul : SDNode<"ISD::FMUL" , SDTFPBinOp, [SDNPCommutative]>;
def fdiv : SDNode<"ISD::FDIV" , SDTFPBinOp>;
def frem : SDNode<"ISD::FREM" , SDTFPBinOp>;
def fma : SDNode<"ISD::FMA" , SDTFPTernaryOp>;
def fmad : SDNode<"ISD::FMAD" , SDTFPTernaryOp>;
def fabs : SDNode<"ISD::FABS" , SDTFPUnaryOp>;
def fminnum : SDNode<"ISD::FMINNUM" , SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def fmaxnum : SDNode<"ISD::FMAXNUM" , SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def fminnum_ieee : SDNode<"ISD::FMINNUM_IEEE", SDTFPBinOp,
[SDNPCommutative]>;
def fmaxnum_ieee : SDNode<"ISD::FMAXNUM_IEEE", SDTFPBinOp,
[SDNPCommutative]>;
def fminimum : SDNode<"ISD::FMINIMUM" , SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def fmaximum : SDNode<"ISD::FMAXIMUM" , SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def fgetsign : SDNode<"ISD::FGETSIGN" , SDTFPToIntOp>;
def fcanonicalize : SDNode<"ISD::FCANONICALIZE", SDTFPUnaryOp>;
def fneg : SDNode<"ISD::FNEG" , SDTFPUnaryOp>;
def fsqrt : SDNode<"ISD::FSQRT" , SDTFPUnaryOp>;
def fsin : SDNode<"ISD::FSIN" , SDTFPUnaryOp>;
def fcos : SDNode<"ISD::FCOS" , SDTFPUnaryOp>;
def fexp2 : SDNode<"ISD::FEXP2" , SDTFPUnaryOp>;
def fpow : SDNode<"ISD::FPOW" , SDTFPBinOp>;
def flog2 : SDNode<"ISD::FLOG2" , SDTFPUnaryOp>;
def frint : SDNode<"ISD::FRINT" , SDTFPUnaryOp>;
def ftrunc : SDNode<"ISD::FTRUNC" , SDTFPUnaryOp>;
def fceil : SDNode<"ISD::FCEIL" , SDTFPUnaryOp>;
def ffloor : SDNode<"ISD::FFLOOR" , SDTFPUnaryOp>;
def fnearbyint : SDNode<"ISD::FNEARBYINT" , SDTFPUnaryOp>;
def fround : SDNode<"ISD::FROUND" , SDTFPUnaryOp>;
def lround : SDNode<"ISD::LROUND" , SDTFPToIntOp>;
def llround : SDNode<"ISD::LLROUND" , SDTFPToIntOp>;
def lrint : SDNode<"ISD::LRINT" , SDTFPToIntOp>;
def llrint : SDNode<"ISD::LLRINT" , SDTFPToIntOp>;
def fpround : SDNode<"ISD::FP_ROUND" , SDTFPRoundOp>;
def fpextend : SDNode<"ISD::FP_EXTEND" , SDTFPExtendOp>;
def fcopysign : SDNode<"ISD::FCOPYSIGN" , SDTFPSignOp>;
def sint_to_fp : SDNode<"ISD::SINT_TO_FP" , SDTIntToFPOp>;
def uint_to_fp : SDNode<"ISD::UINT_TO_FP" , SDTIntToFPOp>;
def fp_to_sint : SDNode<"ISD::FP_TO_SINT" , SDTFPToIntOp>;
def fp_to_uint : SDNode<"ISD::FP_TO_UINT" , SDTFPToIntOp>;
def f16_to_fp : SDNode<"ISD::FP16_TO_FP" , SDTIntToFPOp>;
def fp_to_f16 : SDNode<"ISD::FP_TO_FP16" , SDTFPToIntOp>;
def strict_fadd : SDNode<"ISD::STRICT_FADD",
SDTFPBinOp, [SDNPHasChain, SDNPCommutative]>;
def strict_fsub : SDNode<"ISD::STRICT_FSUB",
SDTFPBinOp, [SDNPHasChain]>;
def strict_fmul : SDNode<"ISD::STRICT_FMUL",
SDTFPBinOp, [SDNPHasChain, SDNPCommutative]>;
def strict_fdiv : SDNode<"ISD::STRICT_FDIV",
SDTFPBinOp, [SDNPHasChain]>;
def strict_frem : SDNode<"ISD::STRICT_FREM",
SDTFPBinOp, [SDNPHasChain]>;
def strict_fma : SDNode<"ISD::STRICT_FMA",
SDTFPTernaryOp, [SDNPHasChain]>;
def strict_fsqrt : SDNode<"ISD::STRICT_FSQRT",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_fsin : SDNode<"ISD::STRICT_FSIN",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_fcos : SDNode<"ISD::STRICT_FCOS",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_fexp2 : SDNode<"ISD::STRICT_FEXP2",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_fpow : SDNode<"ISD::STRICT_FPOW",
SDTFPBinOp, [SDNPHasChain]>;
def strict_flog2 : SDNode<"ISD::STRICT_FLOG2",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_frint : SDNode<"ISD::STRICT_FRINT",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_lrint : SDNode<"ISD::STRICT_LRINT",
SDTFPToIntOp, [SDNPHasChain]>;
def strict_llrint : SDNode<"ISD::STRICT_LLRINT",
SDTFPToIntOp, [SDNPHasChain]>;
def strict_fnearbyint : SDNode<"ISD::STRICT_FNEARBYINT",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_fceil : SDNode<"ISD::STRICT_FCEIL",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_ffloor : SDNode<"ISD::STRICT_FFLOOR",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_lround : SDNode<"ISD::STRICT_LROUND",
SDTFPToIntOp, [SDNPHasChain]>;
def strict_llround : SDNode<"ISD::STRICT_LLROUND",
SDTFPToIntOp, [SDNPHasChain]>;
def strict_fround : SDNode<"ISD::STRICT_FROUND",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_ftrunc : SDNode<"ISD::STRICT_FTRUNC",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_fminnum : SDNode<"ISD::STRICT_FMINNUM",
SDTFPBinOp, [SDNPHasChain,
SDNPCommutative, SDNPAssociative]>;
def strict_fmaxnum : SDNode<"ISD::STRICT_FMAXNUM",
SDTFPBinOp, [SDNPHasChain,
SDNPCommutative, SDNPAssociative]>;
def strict_fminimum : SDNode<"ISD::STRICT_FMINIMUM",
SDTFPBinOp, [SDNPHasChain,
SDNPCommutative, SDNPAssociative]>;
def strict_fmaximum : SDNode<"ISD::STRICT_FMAXIMUM",
SDTFPBinOp, [SDNPHasChain,
SDNPCommutative, SDNPAssociative]>;
def strict_fpround : SDNode<"ISD::STRICT_FP_ROUND",
SDTFPRoundOp, [SDNPHasChain]>;
def strict_fpextend : SDNode<"ISD::STRICT_FP_EXTEND",
SDTFPExtendOp, [SDNPHasChain]>;
def strict_fp_to_sint : SDNode<"ISD::STRICT_FP_TO_SINT",
SDTFPToIntOp, [SDNPHasChain]>;
def strict_fp_to_uint : SDNode<"ISD::STRICT_FP_TO_UINT",
SDTFPToIntOp, [SDNPHasChain]>;
def strict_sint_to_fp : SDNode<"ISD::STRICT_SINT_TO_FP",
SDTIntToFPOp, [SDNPHasChain]>;
def strict_uint_to_fp : SDNode<"ISD::STRICT_UINT_TO_FP",
SDTIntToFPOp, [SDNPHasChain]>;
def setcc : SDNode<"ISD::SETCC" , SDTSetCC>;
def select : SDNode<"ISD::SELECT" , SDTSelect>;
def vselect : SDNode<"ISD::VSELECT" , SDTVSelect>;
def selectcc : SDNode<"ISD::SELECT_CC" , SDTSelectCC>;
def brcc : SDNode<"ISD::BR_CC" , SDTBrCC, [SDNPHasChain]>;
def brcond : SDNode<"ISD::BRCOND" , SDTBrcond, [SDNPHasChain]>;
def brind : SDNode<"ISD::BRIND" , SDTBrind, [SDNPHasChain]>;
def br : SDNode<"ISD::BR" , SDTBr, [SDNPHasChain]>;
def catchret : SDNode<"ISD::CATCHRET" , SDTCatchret,
[SDNPHasChain, SDNPSideEffect]>;
def cleanupret : SDNode<"ISD::CLEANUPRET" , SDTNone, [SDNPHasChain]>;
def catchpad : SDNode<"ISD::CATCHPAD" , SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
def trap : SDNode<"ISD::TRAP" , SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
def debugtrap : SDNode<"ISD::DEBUGTRAP" , SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
def prefetch : SDNode<"ISD::PREFETCH" , SDTPrefetch,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore,
SDNPMemOperand]>;
def readcyclecounter : SDNode<"ISD::READCYCLECOUNTER", SDTIntLeaf,
[SDNPHasChain, SDNPSideEffect]>;
def atomic_fence : SDNode<"ISD::ATOMIC_FENCE" , SDTAtomicFence,
[SDNPHasChain, SDNPSideEffect]>;
def atomic_cmp_swap : SDNode<"ISD::ATOMIC_CMP_SWAP" , SDTAtomic3,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_add : SDNode<"ISD::ATOMIC_LOAD_ADD" , SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_swap : SDNode<"ISD::ATOMIC_SWAP", SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_sub : SDNode<"ISD::ATOMIC_LOAD_SUB" , SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_and : SDNode<"ISD::ATOMIC_LOAD_AND" , SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_clr : SDNode<"ISD::ATOMIC_LOAD_CLR" , SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_or : SDNode<"ISD::ATOMIC_LOAD_OR" , SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_xor : SDNode<"ISD::ATOMIC_LOAD_XOR" , SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_nand: SDNode<"ISD::ATOMIC_LOAD_NAND", SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_min : SDNode<"ISD::ATOMIC_LOAD_MIN", SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_max : SDNode<"ISD::ATOMIC_LOAD_MAX", SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_umin : SDNode<"ISD::ATOMIC_LOAD_UMIN", SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_umax : SDNode<"ISD::ATOMIC_LOAD_UMAX", SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_fadd : SDNode<"ISD::ATOMIC_LOAD_FADD" , SDTFPAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load_fsub : SDNode<"ISD::ATOMIC_LOAD_FSUB" , SDTFPAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load : SDNode<"ISD::ATOMIC_LOAD", SDTAtomicLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def atomic_store : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def masked_st : SDNode<"ISD::MSTORE", SDTMaskedStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def masked_ld : SDNode<"ISD::MLOAD", SDTMaskedLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
// Do not use ld, st directly. Use load, extload, sextload, zextload, store,
// and truncstore (see below).
def ld : SDNode<"ISD::LOAD" , SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def st : SDNode<"ISD::STORE" , SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def ist : SDNode<"ISD::STORE" , SDTIStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def vector_shuffle : SDNode<"ISD::VECTOR_SHUFFLE", SDTVecShuffle, []>;
def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, -1, []>, []>;
def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
[]>;
// vector_extract/vector_insert are deprecated. extractelt/insertelt
// are preferred.
def vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>, []>;
def vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",
SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
def concat_vectors : SDNode<"ISD::CONCAT_VECTORS",
SDTypeProfile<1, 2, [SDTCisSubVecOfVec<1, 0>, SDTCisSameAs<1, 2>]>,[]>;
// This operator does not do subvector type checking. The ARM
// backend, at least, needs it.
def vector_extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR",
SDTypeProfile<1, 2, [SDTCisInt<2>, SDTCisVec<1>, SDTCisVec<0>]>,
[]>;
// This operator does subvector type checking.
def extract_subvector : SDNode<"ISD::EXTRACT_SUBVECTOR", SDTSubVecExtract, []>;
def insert_subvector : SDNode<"ISD::INSERT_SUBVECTOR", SDTSubVecInsert, []>;
// Nodes for intrinsics: use the intrinsic itself and let tblgen use these
// internally. Don't reference these directly.
def intrinsic_void : SDNode<"ISD::INTRINSIC_VOID",
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
[SDNPHasChain]>;
def intrinsic_w_chain : SDNode<"ISD::INTRINSIC_W_CHAIN",
SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>,
[SDNPHasChain]>;
def intrinsic_wo_chain : SDNode<"ISD::INTRINSIC_WO_CHAIN",
SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>, []>;
def SDT_assertext : SDTypeProfile<1, 1,
[SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>;
def assertsext : SDNode<"ISD::AssertSext", SDT_assertext>;
def assertzext : SDNode<"ISD::AssertZext", SDT_assertext>;
//===----------------------------------------------------------------------===//
// Selection DAG Condition Codes
class CondCode<string fcmpName = "", string icmpName = ""> {
string ICmpPredicate = icmpName;
string FCmpPredicate = fcmpName;
}
// ISD::CondCode enums, and mapping to CmpInst::Predicate names
def SETOEQ : CondCode<"FCMP_OEQ">;
def SETOGT : CondCode<"FCMP_OGT">;
def SETOGE : CondCode<"FCMP_OGE">;
def SETOLT : CondCode<"FCMP_OLT">;
def SETOLE : CondCode<"FCMP_OLE">;
def SETONE : CondCode<"FCMP_ONE">;
def SETO : CondCode<"FCMP_ORD">;
def SETUO : CondCode<"FCMP_UNO">;
def SETUEQ : CondCode<"FCMP_UEQ">;
def SETUGT : CondCode<"FCMP_UGT", "ICMP_UGT">;
def SETUGE : CondCode<"FCMP_UGE", "ICMP_UGE">;
def SETULT : CondCode<"FCMP_ULT", "ICMP_ULT">;
def SETULE : CondCode<"FCMP_ULE", "ICMP_ULE">;
def SETUNE : CondCode<"FCMP_UNE">;
def SETEQ : CondCode<"", "ICMP_EQ">;
def SETGT : CondCode<"", "ICMP_SGT">;
def SETGE : CondCode<"", "ICMP_SGE">;
def SETLT : CondCode<"", "ICMP_SLT">;
def SETLE : CondCode<"", "ICMP_SLE">;
def SETNE : CondCode<"", "ICMP_NE">;
//===----------------------------------------------------------------------===//
// Selection DAG Node Transformation Functions.
//
// This mechanism allows targets to manipulate nodes in the output DAG once a
// match has been formed. This is typically used to manipulate immediate
// values.
//
class SDNodeXForm<SDNode opc, code xformFunction> {
SDNode Opcode = opc;
code XFormFunction = xformFunction;
}
def NOOP_SDNodeXForm : SDNodeXForm<imm, [{}]>;
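// For example, a target might extract the low 16 bits of a matched immediate
// (an illustrative sketch modeled on common target usage; getI32Imm is a
// target-provided helper, not defined in this file):
//
//   def LO16 : SDNodeXForm<imm, [{
//     return getI32Imm((uint32_t)N->getZExtValue() & 0xFFFF, SDLoc(N));
//   }]>;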
//===----------------------------------------------------------------------===//
// PatPred Subclasses.
//
// These allow specifying different sorts of predicates that control whether a
// node is matched.
//
class PatPred;
class CodePatPred<code predicate> : PatPred {
code PredicateCode = predicate;
}
//===----------------------------------------------------------------------===//
// Selection DAG Pattern Fragments.
//
// Pattern fragments are reusable chunks of dags that match specific things.
// They can take arguments and have C++ predicates that control whether they
// match. They are intended to make the patterns for common instructions more
// compact and readable.
//
/// PatFrags - Represents a set of pattern fragments. Each single fragment
/// can match something on the DAG, from a single node to multiple nested other
/// fragments. The whole set of fragments matches if any of the single
/// fragments match. This allows e.g. matching an "add with overflow" and
/// a regular "add" with the same fragment set.
///
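/// For example (an illustrative set, not defined in this file), a fragment
/// matching either an add or an or of the same two operands:
///
///   def add_or : PatFrags<(ops node:$lhs, node:$rhs),
///                         [(add node:$lhs, node:$rhs),
///                          (or node:$lhs, node:$rhs)]>;
///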
class PatFrags<dag ops, list<dag> frags, code pred = [{}],
SDNodeXForm xform = NOOP_SDNodeXForm> : SDPatternOperator {
dag Operands = ops;
list<dag> Fragments = frags;
code PredicateCode = pred;
code GISelPredicateCode = [{}];
code ImmediateCode = [{}];
SDNodeXForm OperandTransform = xform;
// When this is set, the PredicateCode may refer to a constant Operands
// vector which contains the captured nodes of the DAG, in the order listed
// by the Operands field above.
//
// This is useful when Fragments involves associative / commutative
// operators: a single piece of code can easily refer to all operands even
// when re-associated / commuted variants of the fragment are matched.
bit PredicateCodeUsesOperands = 0;
// Define a few pre-packaged predicates. This helps GlobalISel import
// existing rules from SelectionDAG for many common cases.
// They will be tested prior to the code in pred and must not be used in
// ImmLeaf and its subclasses.
// Is the desired pre-packaged predicate for a load?
bit IsLoad = ?;
// Is the desired pre-packaged predicate for a store?
bit IsStore = ?;
// Is the desired pre-packaged predicate for an atomic?
bit IsAtomic = ?;
// cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
// cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
bit IsUnindexed = ?;
// cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
bit IsNonExtLoad = ?;
// cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD;
bit IsAnyExtLoad = ?;
// cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
bit IsSignExtLoad = ?;
// cast<LoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
bit IsZeroExtLoad = ?;
// = 0: !cast<StoreSDNode>(N)->isTruncatingStore();
// = 1: cast<StoreSDNode>(N)->isTruncatingStore();
bit IsTruncStore = ?;
// cast<MemSDNode>(N)->getAddressSpace() ==
// If this is empty, accept any address space.
list<int> AddressSpaces = ?;
// cast<MemSDNode>(N)->getAlignment() >=
// If this is empty, accept any alignment.
int MinAlignment = ?;
// cast<AtomicSDNode>(N)->getOrdering() == AtomicOrdering::Monotonic
bit IsAtomicOrderingMonotonic = ?;
// cast<AtomicSDNode>(N)->getOrdering() == AtomicOrdering::Acquire
bit IsAtomicOrderingAcquire = ?;
// cast<AtomicSDNode>(N)->getOrdering() == AtomicOrdering::Release
bit IsAtomicOrderingRelease = ?;
// cast<AtomicSDNode>(N)->getOrdering() == AtomicOrdering::AcquireRelease
bit IsAtomicOrderingAcquireRelease = ?;
// cast<AtomicSDNode>(N)->getOrdering() == AtomicOrdering::SequentiallyConsistent
bit IsAtomicOrderingSequentiallyConsistent = ?;
// = 1: isAcquireOrStronger(cast<AtomicSDNode>(N)->getOrdering())
// = 0: !isAcquireOrStronger(cast<AtomicSDNode>(N)->getOrdering())
bit IsAtomicOrderingAcquireOrStronger = ?;
// = 1: isReleaseOrStronger(cast<AtomicSDNode>(N)->getOrdering())
// = 0: !isReleaseOrStronger(cast<AtomicSDNode>(N)->getOrdering())
bit IsAtomicOrderingReleaseOrStronger = ?;
// cast<LoadSDNode>(N)->getMemoryVT() == MVT::<VT>;
// cast<StoreSDNode>(N)->getMemoryVT() == MVT::<VT>;
ValueType MemoryVT = ?;
// cast<LoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::<VT>;
// cast<StoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::<VT>;
ValueType ScalarMemoryVT = ?;
}
// PatFrag - A version of PatFrags matching only a single fragment.
class PatFrag<dag ops, dag frag, code pred = [{}],
SDNodeXForm xform = NOOP_SDNodeXForm>
: PatFrags<ops, [frag], pred, xform>;
// OutPatFrag is a pattern fragment that is used as part of an output pattern
// (not an input pattern). These do not have predicates or transforms, but are
// used to avoid repeated subexpressions in output patterns.
class OutPatFrag<dag ops, dag frag>
: PatFrag<ops, frag, [{}], NOOP_SDNodeXForm>;
// PatLeaf's are pattern fragments that have no operands. This is just a helper
// to define immediates and other common things concisely.
class PatLeaf<dag frag, code pred = [{}], SDNodeXForm xform = NOOP_SDNodeXForm>
: PatFrag<(ops), frag, pred, xform>;
// ImmLeaf is a pattern fragment with a constraint on the immediate. The
// constraint is a function that is run on the immediate (always with the value
// sign extended out to an int64_t) as Imm. For example:
//
// def immSExt8 : ImmLeaf<i16, [{ return (char)Imm == Imm; }]>;
//
// this is a more convenient form than PatLeaf for matching 'imm' nodes, and is
// also preferred over PatLeaf because it allows the code generator to reason
// more about the constraint.
//
// If FastIsel should ignore all instructions that have an operand of this type,
// the FastIselShouldIgnore flag can be set. This is an optimization to reduce
// the code size of the generated fast instruction selector.
class ImmLeaf<ValueType vt, code pred, SDNodeXForm xform = NOOP_SDNodeXForm,
SDNode ImmNode = imm>
: PatFrag<(ops), (vt ImmNode), [{}], xform> {
let ImmediateCode = pred;
bit FastIselShouldIgnore = 0;
// Is the data type of the immediate an APInt?
bit IsAPInt = 0;
// Is the data type of the immediate an APFloat?
bit IsAPFloat = 0;
}
// Convenience wrapper for ImmLeaf to use timm/TargetConstant instead
// of imm/Constant.
class TImmLeaf<ValueType vt, code pred, SDNodeXForm xform = NOOP_SDNodeXForm,
SDNode ImmNode = timm> : ImmLeaf<vt, pred, xform, ImmNode>;
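// For example (illustrative): a target-constant immediate that fits in an
// unsigned 8-bit field:
//
//   def tuimm8 : TImmLeaf<i32, [{ return isUInt<8>(Imm); }]>;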
// An ImmLeaf except that Imm is an APInt. This is useful when you need to
// zero-extend the immediate instead of sign-extend it.
//
// Note that FastISel does not currently understand IntImmLeaf and will not
// generate code for rules that make use of it. As such, it does not make sense
// to replace ImmLeaf with IntImmLeaf. However, replacing PatLeaf with an
// IntImmLeaf will allow GlobalISel to import the rule.
class IntImmLeaf<ValueType vt, code pred, SDNodeXForm xform = NOOP_SDNodeXForm>
: ImmLeaf<vt, pred, xform> {
let IsAPInt = 1;
let FastIselShouldIgnore = 1;
}
// An ImmLeaf except that Imm is an APFloat.
//
// Note that FastISel does not currently understand FPImmLeaf and will not
// generate code for rules that make use of it.
class FPImmLeaf<ValueType vt, code pred, SDNodeXForm xform = NOOP_SDNodeXForm>
: ImmLeaf<vt, pred, xform, fpimm> {
let IsAPFloat = 1;
let FastIselShouldIgnore = 1;
}
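// For example (illustrative): match a floating-point immediate that is
// exactly positive zero:
//
//   def fpimm_pzero : FPImmLeaf<f64, [{ return Imm.isPosZero(); }]>;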
// Leaf fragments.
def vtInt : PatLeaf<(vt), [{ return N->getVT().isInteger(); }]>;
def vtFP : PatLeaf<(vt), [{ return N->getVT().isFloatingPoint(); }]>;
// Use ISD::isBuildVectorAllOnes or ISD::isBuildVectorAllZeros to look for
// the corresponding build_vector. Will look through bitcasts except when used
// as a pattern root.
def immAllOnesV; // ISD::isBuildVectorAllOnes
def immAllZerosV; // ISD::isBuildVectorAllZeros
// Other helper fragments.
def not : PatFrag<(ops node:$in), (xor node:$in, -1)>;
def vnot : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV)>;
def ineg : PatFrag<(ops node:$in), (sub 0, node:$in)>;
// null_frag - The null pattern operator is used in multiclass instantiations
// which accept an SDPatternOperator for use in matching patterns for internal
// definitions. When expanding a pattern, if the null fragment is referenced
// in the expansion, the pattern is discarded and it is as-if '[]' had been
// specified. This allows multiclasses to have the isel patterns be optional.
def null_frag : SDPatternOperator;
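// For example (an illustrative sketch): a multiclass can accept an optional
// operator, and instantiations that leave it as null_frag have the
// corresponding patterns discarded:
//
//   multiclass UnaryPats<SDPatternOperator op = null_frag> {
//     def : Pat<(op i32:$src), ...>;  // dropped when op is null_frag
//   }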
// load fragments.
def unindexedload : PatFrag<(ops node:$ptr), (ld node:$ptr)> {
let IsLoad = 1;
let IsUnindexed = 1;
}
def load : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> {
let IsLoad = 1;
let IsNonExtLoad = 1;
}
// extending load fragments.
def extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> {
let IsLoad = 1;
let IsAnyExtLoad = 1;
}
def sextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> {
let IsLoad = 1;
let IsSignExtLoad = 1;
}
def zextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> {
let IsLoad = 1;
let IsZeroExtLoad = 1;
}
def extloadi1 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i1;
}
def extloadi8 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i8;
}
def extloadi16 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i16;
}
def extloadi32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i32;
}
+def extloadf16 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = f16;
+}
def extloadf32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = f32;
}
def extloadf64 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = f64;
}
def sextloadi1 : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i1;
}
def sextloadi8 : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i8;
}
def sextloadi16 : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i16;
}
def sextloadi32 : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i32;
}
def zextloadi1 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i1;
}
def zextloadi8 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i8;
}
def zextloadi16 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i16;
}
def zextloadi32 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
let IsLoad = 1;
let MemoryVT = i32;
}
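// Illustrative sketch (hypothetical instruction and addressing-mode names):
// a target typically selects these fragments with patterns such as
//   def : Pat<(i32 (zextloadi8 addr:$src)), (LDRB addr:$src)>;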
def extloadvi1 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i1;
}
def extloadvi8 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i8;
}
def extloadvi16 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i16;
}
def extloadvi32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i32;
}
def extloadvf32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = f32;
}
def extloadvf64 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = f64;
}
def sextloadvi1 : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i1;
}
def sextloadvi8 : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i8;
}
def sextloadvi16 : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i16;
}
def sextloadvi32 : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i32;
}
def zextloadvi1 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i1;
}
def zextloadvi8 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i8;
}
def zextloadvi16 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i16;
}
def zextloadvi32 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
let IsLoad = 1;
let ScalarMemoryVT = i32;
}
// store fragments.
def unindexedstore : PatFrag<(ops node:$val, node:$ptr),
(st node:$val, node:$ptr)> {
let IsStore = 1;
let IsUnindexed = 1;
}
def store : PatFrag<(ops node:$val, node:$ptr),
(unindexedstore node:$val, node:$ptr)> {
let IsStore = 1;
let IsTruncStore = 0;
}
// truncstore fragments.
def truncstore : PatFrag<(ops node:$val, node:$ptr),
(unindexedstore node:$val, node:$ptr)> {
let IsStore = 1;
let IsTruncStore = 1;
}
def truncstorei8 : PatFrag<(ops node:$val, node:$ptr),
(truncstore node:$val, node:$ptr)> {
let IsStore = 1;
let MemoryVT = i8;
}
def truncstorei16 : PatFrag<(ops node:$val, node:$ptr),
(truncstore node:$val, node:$ptr)> {
let IsStore = 1;
let MemoryVT = i16;
}
def truncstorei32 : PatFrag<(ops node:$val, node:$ptr),
(truncstore node:$val, node:$ptr)> {
let IsStore = 1;
let MemoryVT = i32;
+}
+def truncstoref16 : PatFrag<(ops node:$val, node:$ptr),
+ (truncstore node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = f16;
}
def truncstoref32 : PatFrag<(ops node:$val, node:$ptr),
(truncstore node:$val, node:$ptr)> {
let IsStore = 1;
let MemoryVT = f32;
}
def truncstoref64 : PatFrag<(ops node:$val, node:$ptr),
(truncstore node:$val, node:$ptr)> {
let IsStore = 1;
let MemoryVT = f64;
}
def truncstorevi8 : PatFrag<(ops node:$val, node:$ptr),
(truncstore node:$val, node:$ptr)> {
let IsStore = 1;
let ScalarMemoryVT = i8;
}
def truncstorevi16 : PatFrag<(ops node:$val, node:$ptr),
(truncstore node:$val, node:$ptr)> {
let IsStore = 1;
let ScalarMemoryVT = i16;
}
def truncstorevi32 : PatFrag<(ops node:$val, node:$ptr),
(truncstore node:$val, node:$ptr)> {
let IsStore = 1;
let ScalarMemoryVT = i32;
}
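// Illustrative sketch (hypothetical instruction and addressing-mode names):
// a truncating halfword store would be selected with a pattern such as
//   def : Pat<(truncstorei16 i32:$val, addr:$dst), (STRH $val, addr:$dst)>;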
// indexed store fragments.
def istore : PatFrag<(ops node:$val, node:$base, node:$offset),
(ist node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let IsTruncStore = 0;
}
def pre_store : PatFrag<(ops node:$val, node:$base, node:$offset),
(istore node:$val, node:$base, node:$offset), [{
ISD::MemIndexedMode AM = cast<StoreSDNode>(N)->getAddressingMode();
return AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
}]>;
def itruncstore : PatFrag<(ops node:$val, node:$base, node:$offset),
(ist node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let IsTruncStore = 1;
}
def pre_truncst : PatFrag<(ops node:$val, node:$base, node:$offset),
(itruncstore node:$val, node:$base, node:$offset), [{
ISD::MemIndexedMode AM = cast<StoreSDNode>(N)->getAddressingMode();
return AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
}]>;
def pre_truncsti1 : PatFrag<(ops node:$val, node:$base, node:$offset),
(pre_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let MemoryVT = i1;
}
def pre_truncsti8 : PatFrag<(ops node:$val, node:$base, node:$offset),
(pre_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let MemoryVT = i8;
}
def pre_truncsti16 : PatFrag<(ops node:$val, node:$base, node:$offset),
(pre_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let MemoryVT = i16;
}
def pre_truncsti32 : PatFrag<(ops node:$val, node:$base, node:$offset),
(pre_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let MemoryVT = i32;
}
def pre_truncstf32 : PatFrag<(ops node:$val, node:$base, node:$offset),
(pre_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let MemoryVT = f32;
}
def pre_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset),
(pre_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let ScalarMemoryVT = i8;
}
def pre_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset),
(pre_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let ScalarMemoryVT = i16;
}
def post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
(istore node:$val, node:$ptr, node:$offset), [{
ISD::MemIndexedMode AM = cast<StoreSDNode>(N)->getAddressingMode();
return AM == ISD::POST_INC || AM == ISD::POST_DEC;
}]>;
def post_truncst : PatFrag<(ops node:$val, node:$base, node:$offset),
(itruncstore node:$val, node:$base, node:$offset), [{
ISD::MemIndexedMode AM = cast<StoreSDNode>(N)->getAddressingMode();
return AM == ISD::POST_INC || AM == ISD::POST_DEC;
}]>;
def post_truncsti1 : PatFrag<(ops node:$val, node:$base, node:$offset),
(post_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let MemoryVT = i1;
}
def post_truncsti8 : PatFrag<(ops node:$val, node:$base, node:$offset),
(post_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let MemoryVT = i8;
}
def post_truncsti16 : PatFrag<(ops node:$val, node:$base, node:$offset),
(post_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let MemoryVT = i16;
}
def post_truncsti32 : PatFrag<(ops node:$val, node:$base, node:$offset),
(post_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let MemoryVT = i32;
}
def post_truncstf32 : PatFrag<(ops node:$val, node:$base, node:$offset),
(post_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let MemoryVT = f32;
}
def post_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset),
(post_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let ScalarMemoryVT = i8;
}
def post_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset),
(post_truncst node:$val, node:$base, node:$offset)> {
let IsStore = 1;
let ScalarMemoryVT = i16;
}
// TODO: Split these into volatile and unordered flavors to enable
// selectively legal optimizations for each. (See D66309)
def simple_load : PatFrag<(ops node:$ptr),
(load node:$ptr), [{
return cast<LoadSDNode>(N)->isSimple();
}]>;
def simple_store : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->isSimple();
}]>;
// nontemporal store fragments.
def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->isNonTemporal();
}]>;
def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
(nontemporalstore node:$val, node:$ptr), [{
StoreSDNode *St = cast<StoreSDNode>(N);
return St->getAlignment() >= St->getMemoryVT().getStoreSize();
}]>;
def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
(nontemporalstore node:$val, node:$ptr), [{
StoreSDNode *St = cast<StoreSDNode>(N);
return St->getAlignment() < St->getMemoryVT().getStoreSize();
}]>;
// nontemporal load fragments.
def nontemporalload : PatFrag<(ops node:$ptr),
(load node:$ptr), [{
return cast<LoadSDNode>(N)->isNonTemporal();
}]>;
def alignednontemporalload : PatFrag<(ops node:$ptr),
(nontemporalload node:$ptr), [{
LoadSDNode *Ld = cast<LoadSDNode>(N);
return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
}]>;
// setcc convenience fragments.
def setoeq : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETOEQ)>;
def setogt : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETOGT)>;
def setoge : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETOGE)>;
def setolt : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETOLT)>;
def setole : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETOLE)>;
def setone : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETONE)>;
def seto : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETO)>;
def setuo : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETUO)>;
def setueq : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETUEQ)>;
def setugt : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETUGT)>;
def setuge : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETUGE)>;
def setult : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETULT)>;
def setule : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETULE)>;
def setune : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETUNE)>;
def seteq : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETEQ)>;
def setgt : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETGT)>;
def setge : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETGE)>;
def setlt : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETLT)>;
def setle : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETLE)>;
def setne : PatFrag<(ops node:$lhs, node:$rhs),
(setcc node:$lhs, node:$rhs, SETNE)>;
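// Illustrative sketch (hypothetical instruction name): these fragments let a
// pattern name the condition code directly, e.g.
//   def : Pat<(i1 (setgt i32:$a, i32:$b)), (CMPGT $a, $b)>;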
// We don't have strict FP extended loads as single DAG nodes, but we can
// still provide convenience fragments to match those operations.
def strict_extloadf32 : PatFrag<(ops node:$ptr),
(strict_fpextend (f32 (load node:$ptr)))>;
def strict_extloadf64 : PatFrag<(ops node:$ptr),
(strict_fpextend (f64 (load node:$ptr)))>;
// Convenience fragments to match both strict and non-strict fp operations
def any_fadd : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_fadd node:$lhs, node:$rhs),
(fadd node:$lhs, node:$rhs)]>;
def any_fsub : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_fsub node:$lhs, node:$rhs),
(fsub node:$lhs, node:$rhs)]>;
def any_fmul : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_fmul node:$lhs, node:$rhs),
(fmul node:$lhs, node:$rhs)]>;
def any_fdiv : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_fdiv node:$lhs, node:$rhs),
(fdiv node:$lhs, node:$rhs)]>;
def any_frem : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_frem node:$lhs, node:$rhs),
(frem node:$lhs, node:$rhs)]>;
def any_fma : PatFrags<(ops node:$src1, node:$src2, node:$src3),
[(strict_fma node:$src1, node:$src2, node:$src3),
(fma node:$src1, node:$src2, node:$src3)]>;
def any_fsqrt : PatFrags<(ops node:$src),
[(strict_fsqrt node:$src),
(fsqrt node:$src)]>;
def any_fsin : PatFrags<(ops node:$src),
[(strict_fsin node:$src),
(fsin node:$src)]>;
def any_fcos : PatFrags<(ops node:$src),
[(strict_fcos node:$src),
(fcos node:$src)]>;
def any_fexp2 : PatFrags<(ops node:$src),
[(strict_fexp2 node:$src),
(fexp2 node:$src)]>;
def any_fpow : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_fpow node:$lhs, node:$rhs),
(fpow node:$lhs, node:$rhs)]>;
def any_flog2 : PatFrags<(ops node:$src),
[(strict_flog2 node:$src),
(flog2 node:$src)]>;
def any_frint : PatFrags<(ops node:$src),
[(strict_frint node:$src),
(frint node:$src)]>;
def any_lrint : PatFrags<(ops node:$src),
[(strict_lrint node:$src),
(lrint node:$src)]>;
def any_llrint : PatFrags<(ops node:$src),
[(strict_llrint node:$src),
(llrint node:$src)]>;
def any_fnearbyint : PatFrags<(ops node:$src),
[(strict_fnearbyint node:$src),
(fnearbyint node:$src)]>;
def any_fceil : PatFrags<(ops node:$src),
[(strict_fceil node:$src),
(fceil node:$src)]>;
def any_ffloor : PatFrags<(ops node:$src),
[(strict_ffloor node:$src),
(ffloor node:$src)]>;
def any_lround : PatFrags<(ops node:$src),
[(strict_lround node:$src),
(lround node:$src)]>;
def any_llround : PatFrags<(ops node:$src),
[(strict_llround node:$src),
(llround node:$src)]>;
def any_fround : PatFrags<(ops node:$src),
[(strict_fround node:$src),
(fround node:$src)]>;
def any_ftrunc : PatFrags<(ops node:$src),
[(strict_ftrunc node:$src),
(ftrunc node:$src)]>;
def any_fmaxnum : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_fmaxnum node:$lhs, node:$rhs),
(fmaxnum node:$lhs, node:$rhs)]>;
def any_fminnum : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_fminnum node:$lhs, node:$rhs),
(fminnum node:$lhs, node:$rhs)]>;
def any_fmaximum : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_fmaximum node:$lhs, node:$rhs),
(fmaximum node:$lhs, node:$rhs)]>;
def any_fminimum : PatFrags<(ops node:$lhs, node:$rhs),
[(strict_fminimum node:$lhs, node:$rhs),
(fminimum node:$lhs, node:$rhs)]>;
def any_fpround : PatFrags<(ops node:$src),
[(strict_fpround node:$src),
(fpround node:$src)]>;
def any_fpextend : PatFrags<(ops node:$src),
[(strict_fpextend node:$src),
(fpextend node:$src)]>;
def any_extloadf32 : PatFrags<(ops node:$ptr),
[(strict_extloadf32 node:$ptr),
(extloadf32 node:$ptr)]>;
def any_extloadf64 : PatFrags<(ops node:$ptr),
[(strict_extloadf64 node:$ptr),
(extloadf64 node:$ptr)]>;
def any_fp_to_sint : PatFrags<(ops node:$src),
[(strict_fp_to_sint node:$src),
(fp_to_sint node:$src)]>;
def any_fp_to_uint : PatFrags<(ops node:$src),
[(strict_fp_to_uint node:$src),
(fp_to_uint node:$src)]>;
def any_sint_to_fp : PatFrags<(ops node:$src),
[(strict_sint_to_fp node:$src),
(sint_to_fp node:$src)]>;
def any_uint_to_fp : PatFrags<(ops node:$src),
[(strict_uint_to_fp node:$src),
(uint_to_fp node:$src)]>;
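// Illustrative sketch (hypothetical instruction name): a single pattern can
// then cover both the constrained and the default node, e.g.
//   def : Pat<(f64 (any_fadd f64:$a, f64:$b)), (FADDD $a, $b)>;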
multiclass binary_atomic_op_ord<SDNode atomic_op> {
def #NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val),
(!cast<SDPatternOperator>(#NAME) node:$ptr, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingMonotonic = 1;
}
def #NAME#_acquire : PatFrag<(ops node:$ptr, node:$val),
(!cast<SDPatternOperator>(#NAME) node:$ptr, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingAcquire = 1;
}
def #NAME#_release : PatFrag<(ops node:$ptr, node:$val),
(!cast<SDPatternOperator>(#NAME) node:$ptr, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingRelease = 1;
}
def #NAME#_acq_rel : PatFrag<(ops node:$ptr, node:$val),
(!cast<SDPatternOperator>(#NAME) node:$ptr, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingAcquireRelease = 1;
}
def #NAME#_seq_cst : PatFrag<(ops node:$ptr, node:$val),
(!cast<SDPatternOperator>(#NAME) node:$ptr, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingSequentiallyConsistent = 1;
}
}
multiclass ternary_atomic_op_ord<SDNode atomic_op> {
def #NAME#_monotonic : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(!cast<SDPatternOperator>(#NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingMonotonic = 1;
}
def #NAME#_acquire : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(!cast<SDPatternOperator>(#NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingAcquire = 1;
}
def #NAME#_release : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(!cast<SDPatternOperator>(#NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingRelease = 1;
}
def #NAME#_acq_rel : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(!cast<SDPatternOperator>(#NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingAcquireRelease = 1;
}
def #NAME#_seq_cst : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(!cast<SDPatternOperator>(#NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let IsAtomicOrderingSequentiallyConsistent = 1;
}
}
multiclass binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
def _8 : PatFrag<(ops node:$ptr, node:$val),
(atomic_op node:$ptr, node:$val)> {
let IsAtomic = 1;
let MemoryVT = !if(IsInt, i8, ?);
}
def _16 : PatFrag<(ops node:$ptr, node:$val),
(atomic_op node:$ptr, node:$val)> {
let IsAtomic = 1;
let MemoryVT = !if(IsInt, i16, f16);
}
def _32 : PatFrag<(ops node:$ptr, node:$val),
(atomic_op node:$ptr, node:$val)> {
let IsAtomic = 1;
let MemoryVT = !if(IsInt, i32, f32);
}
def _64 : PatFrag<(ops node:$ptr, node:$val),
(atomic_op node:$ptr, node:$val)> {
let IsAtomic = 1;
let MemoryVT = !if(IsInt, i64, f64);
}
defm NAME#_8 : binary_atomic_op_ord<atomic_op>;
defm NAME#_16 : binary_atomic_op_ord<atomic_op>;
defm NAME#_32 : binary_atomic_op_ord<atomic_op>;
defm NAME#_64 : binary_atomic_op_ord<atomic_op>;
}
multiclass ternary_atomic_op<SDNode atomic_op> {
def _8 : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(atomic_op node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let MemoryVT = i8;
}
def _16 : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(atomic_op node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let MemoryVT = i16;
}
def _32 : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(atomic_op node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let MemoryVT = i32;
}
def _64 : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(atomic_op node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = 1;
let MemoryVT = i64;
}
defm NAME#_8 : ternary_atomic_op_ord<atomic_op>;
defm NAME#_16 : ternary_atomic_op_ord<atomic_op>;
defm NAME#_32 : ternary_atomic_op_ord<atomic_op>;
defm NAME#_64 : ternary_atomic_op_ord<atomic_op>;
}
defm atomic_load_add : binary_atomic_op<atomic_load_add>;
defm atomic_swap : binary_atomic_op<atomic_swap>;
defm atomic_load_sub : binary_atomic_op<atomic_load_sub>;
defm atomic_load_and : binary_atomic_op<atomic_load_and>;
defm atomic_load_clr : binary_atomic_op<atomic_load_clr>;
defm atomic_load_or : binary_atomic_op<atomic_load_or>;
defm atomic_load_xor : binary_atomic_op<atomic_load_xor>;
defm atomic_load_nand : binary_atomic_op<atomic_load_nand>;
defm atomic_load_min : binary_atomic_op<atomic_load_min>;
defm atomic_load_max : binary_atomic_op<atomic_load_max>;
defm atomic_load_umin : binary_atomic_op<atomic_load_umin>;
defm atomic_load_umax : binary_atomic_op<atomic_load_umax>;
defm atomic_store : binary_atomic_op<atomic_store>;
defm atomic_cmp_swap : ternary_atomic_op<atomic_cmp_swap>;
def atomic_load_8 :
PatFrag<(ops node:$ptr),
(atomic_load node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i8;
}
def atomic_load_16 :
PatFrag<(ops node:$ptr),
(atomic_load node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i16;
}
def atomic_load_32 :
PatFrag<(ops node:$ptr),
(atomic_load node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i32;
}
def atomic_load_64 :
PatFrag<(ops node:$ptr),
(atomic_load node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i64;
}
//===----------------------------------------------------------------------===//
// Selection DAG Pattern Support.
//
// Patterns are what are actually matched against by the target-flavored
// instruction selection DAG. Instructions defined by the target implicitly
// define patterns in most cases, but patterns can also be explicitly added when
// an operation is defined by a sequence of instructions (e.g. loading a large
// immediate value on RISC targets that do not support immediates as large as
// their GPRs).
//
class Pattern<dag patternToMatch, list<dag> resultInstrs> {
dag PatternToMatch = patternToMatch;
list<dag> ResultInstrs = resultInstrs;
list<Predicate> Predicates = []; // See class Instruction in Target.td.
int AddedComplexity = 0; // See class Instruction in Target.td.
}
// Pat - A simple (but common) form of a pattern, which produces a simple result
// not needing a full list.
class Pat<dag pattern, dag result> : Pattern<pattern, [result]>;
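// Illustrative sketch (hypothetical instructions and SDNodeXForms): the
// large-immediate case mentioned above could be written as a pattern that
// produces a two-instruction sequence:
//   def : Pat<(i32 imm:$imm),
//             (ORI (LUI (HI16 imm:$imm)), (LO16 imm:$imm))>;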
//===----------------------------------------------------------------------===//
// Complex pattern definitions.
//
// Complex patterns, e.g. the X86 addressing mode, require pattern matching
// code in C++. NumOperands is the number of operands returned by the select
// function; SelectFunc is the name of the function used to pattern match the
// maximal pattern; RootNodes are the list of possible root nodes of the
// sub-dags to match.
// e.g. X86 addressing mode -
//   def addr : ComplexPattern<iPTR, 4, "SelectAddr", [add]>;
//
class ComplexPattern<ValueType ty, int numops, string fn,
list<SDNode> roots = [], list<SDNodeProperty> props = [],
int complexity = -1> {
ValueType Ty = ty;
int NumOperands = numops;
string SelectFunc = fn;
list<SDNode> RootNodes = roots;
list<SDNodeProperty> Properties = props;
int Complexity = complexity;
}
Index: head/contrib/llvm-project/llvm/lib/CodeGen/RDFGraph.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/CodeGen/RDFGraph.cpp (nonexistent)
+++ head/contrib/llvm-project/llvm/lib/CodeGen/RDFGraph.cpp (revision 362609)
@@ -0,0 +1,1837 @@
+//===- RDFGraph.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target-independent, SSA-based data flow graph for register data flow (RDF).
+//
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFRegisters.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace rdf;
+
+// Printing functions. Have them here first, so that the rest of the code
+// can use them.
+namespace llvm {
+namespace rdf {
+
+raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P) {
+ if (!P.Mask.all())
+ OS << ':' << PrintLaneMask(P.Mask);
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) {
+ auto &TRI = P.G.getTRI();
+ if (P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs())
+ OS << TRI.getName(P.Obj.Reg);
+ else
+ OS << '#' << P.Obj.Reg;
+ OS << PrintLaneMaskOpt(P.Obj.Mask);
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) {
+ auto NA = P.G.addr<NodeBase*>(P.Obj);
+ uint16_t Attrs = NA.Addr->getAttrs();
+ uint16_t Kind = NodeAttrs::kind(Attrs);
+ uint16_t Flags = NodeAttrs::flags(Attrs);
+ switch (NodeAttrs::type(Attrs)) {
+ case NodeAttrs::Code:
+ switch (Kind) {
+ case NodeAttrs::Func: OS << 'f'; break;
+ case NodeAttrs::Block: OS << 'b'; break;
+ case NodeAttrs::Stmt: OS << 's'; break;
+ case NodeAttrs::Phi: OS << 'p'; break;
+ default: OS << "c?"; break;
+ }
+ break;
+ case NodeAttrs::Ref:
+ if (Flags & NodeAttrs::Undef)
+ OS << '/';
+ if (Flags & NodeAttrs::Dead)
+ OS << '\\';
+ if (Flags & NodeAttrs::Preserving)
+ OS << '+';
+ if (Flags & NodeAttrs::Clobbering)
+ OS << '~';
+ switch (Kind) {
+ case NodeAttrs::Use: OS << 'u'; break;
+ case NodeAttrs::Def: OS << 'd'; break;
+ case NodeAttrs::Block: OS << 'b'; break;
+ default: OS << "r?"; break;
+ }
+ break;
+ default:
+ OS << '?';
+ break;
+ }
+ OS << P.Obj;
+ if (Flags & NodeAttrs::Shadow)
+ OS << '"';
+ return OS;
+}
+
+static void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA,
+ const DataFlowGraph &G) {
+ OS << Print<NodeId>(RA.Id, G) << '<'
+ << Print<RegisterRef>(RA.Addr->getRegRef(G), G) << '>';
+ if (RA.Addr->getFlags() & NodeAttrs::Fixed)
+ OS << '!';
+}
+
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) {
+ printRefHeader(OS, P.Obj, P.G);
+ OS << '(';
+ if (NodeId N = P.Obj.Addr->getReachingDef())
+ OS << Print<NodeId>(N, P.G);
+ OS << ',';
+ if (NodeId N = P.Obj.Addr->getReachedDef())
+ OS << Print<NodeId>(N, P.G);
+ OS << ',';
+ if (NodeId N = P.Obj.Addr->getReachedUse())
+ OS << Print<NodeId>(N, P.G);
+ OS << "):";
+ if (NodeId N = P.Obj.Addr->getSibling())
+ OS << Print<NodeId>(N, P.G);
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) {
+ printRefHeader(OS, P.Obj, P.G);
+ OS << '(';
+ if (NodeId N = P.Obj.Addr->getReachingDef())
+ OS << Print<NodeId>(N, P.G);
+ OS << "):";
+ if (NodeId N = P.Obj.Addr->getSibling())
+ OS << Print<NodeId>(N, P.G);
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS,
+ const Print<NodeAddr<PhiUseNode*>> &P) {
+ printRefHeader(OS, P.Obj, P.G);
+ OS << '(';
+ if (NodeId N = P.Obj.Addr->getReachingDef())
+ OS << Print<NodeId>(N, P.G);
+ OS << ',';
+ if (NodeId N = P.Obj.Addr->getPredecessor())
+ OS << Print<NodeId>(N, P.G);
+ OS << "):";
+ if (NodeId N = P.Obj.Addr->getSibling())
+ OS << Print<NodeId>(N, P.G);
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) {
+ switch (P.Obj.Addr->getKind()) {
+ case NodeAttrs::Def:
+ OS << PrintNode<DefNode*>(P.Obj, P.G);
+ break;
+ case NodeAttrs::Use:
+ if (P.Obj.Addr->getFlags() & NodeAttrs::PhiRef)
+ OS << PrintNode<PhiUseNode*>(P.Obj, P.G);
+ else
+ OS << PrintNode<UseNode*>(P.Obj, P.G);
+ break;
+ }
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) {
+ unsigned N = P.Obj.size();
+ for (auto I : P.Obj) {
+ OS << Print<NodeId>(I.Id, P.G);
+ if (--N)
+ OS << ' ';
+ }
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeSet> &P) {
+ unsigned N = P.Obj.size();
+ for (auto I : P.Obj) {
+ OS << Print<NodeId>(I, P.G);
+ if (--N)
+ OS << ' ';
+ }
+ return OS;
+}
+
+namespace {
+
+ template <typename T>
+ struct PrintListV {
+ PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {}
+
+ using Type = T;
+ const NodeList &List;
+ const DataFlowGraph &G;
+ };
+
+ template <typename T>
+ raw_ostream &operator<< (raw_ostream &OS, const PrintListV<T> &P) {
+ unsigned N = P.List.size();
+ for (NodeAddr<T> A : P.List) {
+ OS << PrintNode<T>(A, P.G);
+ if (--N)
+ OS << ", ";
+ }
+ return OS;
+ }
+
+} // end anonymous namespace
+
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) {
+ OS << Print<NodeId>(P.Obj.Id, P.G) << ": phi ["
+ << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']';
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<StmtNode *>> &P) {
+ const MachineInstr &MI = *P.Obj.Addr->getCode();
+ unsigned Opc = MI.getOpcode();
+ OS << Print<NodeId>(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc);
+ // Print the target for calls and branches (for readability).
+ if (MI.isCall() || MI.isBranch()) {
+ MachineInstr::const_mop_iterator T =
+ llvm::find_if(MI.operands(),
+ [] (const MachineOperand &Op) -> bool {
+ return Op.isMBB() || Op.isGlobal() || Op.isSymbol();
+ });
+ if (T != MI.operands_end()) {
+ OS << ' ';
+ if (T->isMBB())
+ OS << printMBBReference(*T->getMBB());
+ else if (T->isGlobal())
+ OS << T->getGlobal()->getName();
+ else if (T->isSymbol())
+ OS << T->getSymbolName();
+ }
+ }
+ OS << " [" << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']';
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS,
+ const Print<NodeAddr<InstrNode*>> &P) {
+ switch (P.Obj.Addr->getKind()) {
+ case NodeAttrs::Phi:
+ OS << PrintNode<PhiNode*>(P.Obj, P.G);
+ break;
+ case NodeAttrs::Stmt:
+ OS << PrintNode<StmtNode*>(P.Obj, P.G);
+ break;
+ default:
+ OS << "instr? " << Print<NodeId>(P.Obj.Id, P.G);
+ break;
+ }
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS,
+ const Print<NodeAddr<BlockNode*>> &P) {
+ MachineBasicBlock *BB = P.Obj.Addr->getCode();
+ unsigned NP = BB->pred_size();
+ std::vector<int> Ns;
+ auto PrintBBs = [&OS] (std::vector<int> Ns) -> void {
+ unsigned N = Ns.size();
+ for (int I : Ns) {
+ OS << "%bb." << I;
+ if (--N)
+ OS << ", ";
+ }
+ };
+
+ OS << Print<NodeId>(P.Obj.Id, P.G) << ": --- " << printMBBReference(*BB)
+ << " --- preds(" << NP << "): ";
+ for (MachineBasicBlock *B : BB->predecessors())
+ Ns.push_back(B->getNumber());
+ PrintBBs(Ns);
+
+ unsigned NS = BB->succ_size();
+ OS << " succs(" << NS << "): ";
+ Ns.clear();
+ for (MachineBasicBlock *B : BB->successors())
+ Ns.push_back(B->getNumber());
+ PrintBBs(Ns);
+ OS << '\n';
+
+ for (auto I : P.Obj.Addr->members(P.G))
+ OS << PrintNode<InstrNode*>(I, P.G) << '\n';
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<FuncNode *>> &P) {
+ OS << "DFG dump:[\n" << Print<NodeId>(P.Obj.Id, P.G) << ": Function: "
+ << P.Obj.Addr->getCode()->getName() << '\n';
+ for (auto I : P.Obj.Addr->members(P.G))
+ OS << PrintNode<BlockNode*>(I, P.G) << '\n';
+ OS << "]\n";
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) {
+ OS << '{';
+ for (auto I : P.Obj)
+ OS << ' ' << Print<RegisterRef>(I, P.G);
+ OS << " }";
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterAggr> &P) {
+ P.Obj.print(OS);
+ return OS;
+}
+
+raw_ostream &operator<< (raw_ostream &OS,
+ const Print<DataFlowGraph::DefStack> &P) {
+ for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E; ) {
+ OS << Print<NodeId>(I->Id, P.G)
+ << '<' << Print<RegisterRef>(I->Addr->getRegRef(P.G), P.G) << '>';
+ I.down();
+ if (I != E)
+ OS << ' ';
+ }
+ return OS;
+}
+
+} // end namespace rdf
+} // end namespace llvm
+
+// Node allocation functions.
+//
+// Node allocator is like a slab memory allocator: it allocates blocks of
+// memory in sizes that are multiples of the size of a node. Each block has
+// the same size. Nodes are allocated from the currently active block, and
+// when it becomes full, a new one is created.
+// The mapping scheme between a node id and the block containing the node,
+// as well as the node's location within that block, is described in the
+// header file.
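+// As a rough sketch of that scheme (the header has the authoritative
+// definition): a node id packs the block number into its high bits and the
+// node's index within the block into the low BitsPerIndex bits, adjusted
+// so that 0 remains reserved as the "null" id.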
+//
+void NodeAllocator::startNewBlock() {
+ void *T = MemPool.Allocate(NodesPerBlock*NodeMemSize, NodeMemSize);
+ char *P = static_cast<char*>(T);
+ Blocks.push_back(P);
+ // Check if the block index is still within the allowed range, i.e. less
+ // than 2^N, where N is the number of bits in NodeId for the block index.
+ // BitsPerIndex is the number of bits per node index.
+ assert((Blocks.size() < ((size_t)1 << (8*sizeof(NodeId)-BitsPerIndex))) &&
+ "Out of bits for block index");
+ ActiveEnd = P;
+}
+
+bool NodeAllocator::needNewBlock() {
+ if (Blocks.empty())
+ return true;
+
+ char *ActiveBegin = Blocks.back();
+ uint32_t Index = (ActiveEnd-ActiveBegin)/NodeMemSize;
+ return Index >= NodesPerBlock;
+}
+
+NodeAddr<NodeBase*> NodeAllocator::New() {
+ if (needNewBlock())
+ startNewBlock();
+
+ uint32_t ActiveB = Blocks.size()-1;
+ uint32_t Index = (ActiveEnd - Blocks[ActiveB])/NodeMemSize;
+ NodeAddr<NodeBase*> NA = { reinterpret_cast<NodeBase*>(ActiveEnd),
+ makeId(ActiveB, Index) };
+ ActiveEnd += NodeMemSize;
+ return NA;
+}
+
+NodeId NodeAllocator::id(const NodeBase *P) const {
+ uintptr_t A = reinterpret_cast<uintptr_t>(P);
+ for (unsigned i = 0, n = Blocks.size(); i != n; ++i) {
+ uintptr_t B = reinterpret_cast<uintptr_t>(Blocks[i]);
+ if (A < B || A >= B + NodesPerBlock*NodeMemSize)
+ continue;
+ uint32_t Idx = (A-B)/NodeMemSize;
+ return makeId(i, Idx);
+ }
+ llvm_unreachable("Invalid node address");
+}
+
+void NodeAllocator::clear() {
+ MemPool.Reset();
+ Blocks.clear();
+ ActiveEnd = nullptr;
+}
+
+// Insert node NA after "this" in the circular chain.
+void NodeBase::append(NodeAddr<NodeBase*> NA) {
+ NodeId Nx = Next;
+ // If NA is already "next", do nothing.
+ if (Next != NA.Id) {
+ Next = NA.Id;
+ NA.Addr->Next = Nx;
+ }
+}
+
+// Fundamental node manipulator functions.
+
+// Obtain the register reference from a reference node.
+RegisterRef RefNode::getRegRef(const DataFlowGraph &G) const {
+ assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+ if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)
+ return G.unpack(Ref.PR);
+ assert(Ref.Op != nullptr);
+ return G.makeRegRef(*Ref.Op);
+}
+
+// Set the register reference in the reference node directly (for references
+// in phi nodes).
+void RefNode::setRegRef(RegisterRef RR, DataFlowGraph &G) {
+ assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+ assert(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef);
+ Ref.PR = G.pack(RR);
+}
+
+// Set the register reference in the reference node based on a machine
+// operand (for references in statement nodes).
+void RefNode::setRegRef(MachineOperand *Op, DataFlowGraph &G) {
+ assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+ assert(!(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef));
+ (void)G;
+ Ref.Op = Op;
+}
+
+// Get the owner of a given reference node.
+NodeAddr<NodeBase*> RefNode::getOwner(const DataFlowGraph &G) {
+ NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext());
+
+ while (NA.Addr != this) {
+ if (NA.Addr->getType() == NodeAttrs::Code)
+ return NA;
+ NA = G.addr<NodeBase*>(NA.Addr->getNext());
+ }
+ llvm_unreachable("No owner in circular list");
+}
+
+// Connect the def node to the reaching def node.
+void DefNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) {
+ Ref.RD = DA.Id;
+ Ref.Sib = DA.Addr->getReachedDef();
+ DA.Addr->setReachedDef(Self);
+}
+
+// Connect the use node to the reaching def node.
+void UseNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) {
+ Ref.RD = DA.Id;
+ Ref.Sib = DA.Addr->getReachedUse();
+ DA.Addr->setReachedUse(Self);
+}
+
+// Get the first member of the code node.
+NodeAddr<NodeBase*> CodeNode::getFirstMember(const DataFlowGraph &G) const {
+ if (Code.FirstM == 0)
+ return NodeAddr<NodeBase*>();
+ return G.addr<NodeBase*>(Code.FirstM);
+}
+
+// Get the last member of the code node.
+NodeAddr<NodeBase*> CodeNode::getLastMember(const DataFlowGraph &G) const {
+ if (Code.LastM == 0)
+ return NodeAddr<NodeBase*>();
+ return G.addr<NodeBase*>(Code.LastM);
+}
+
+// Add node NA at the end of the member list of the given code node.
+void CodeNode::addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) {
+ NodeAddr<NodeBase*> ML = getLastMember(G);
+ if (ML.Id != 0) {
+ ML.Addr->append(NA);
+ } else {
+ Code.FirstM = NA.Id;
+ NodeId Self = G.id(this);
+ NA.Addr->setNext(Self);
+ }
+ Code.LastM = NA.Id;
+}
+
+// Add node NA after member node MA in the given code node.
+void CodeNode::addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA,
+ const DataFlowGraph &G) {
+ MA.Addr->append(NA);
+ if (Code.LastM == MA.Id)
+ Code.LastM = NA.Id;
+}
+
+// Remove member node NA from the given code node.
+void CodeNode::removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) {
+ NodeAddr<NodeBase*> MA = getFirstMember(G);
+ assert(MA.Id != 0);
+
+ // Special handling if the member to remove is the first member.
+ if (MA.Id == NA.Id) {
+ if (Code.LastM == MA.Id) {
+ // If it is the only member, set both first and last to 0.
+ Code.FirstM = Code.LastM = 0;
+ } else {
+ // Otherwise, advance the first member.
+ Code.FirstM = MA.Addr->getNext();
+ }
+ return;
+ }
+
+ while (MA.Addr != this) {
+ NodeId MX = MA.Addr->getNext();
+ if (MX == NA.Id) {
+ MA.Addr->setNext(NA.Addr->getNext());
+ // If the member to remove happens to be the last one, update the
+ // LastM indicator.
+ if (Code.LastM == NA.Id)
+ Code.LastM = MA.Id;
+ return;
+ }
+ MA = G.addr<NodeBase*>(MX);
+ }
+ llvm_unreachable("No such member");
+}
+
+// Return the list of all members of the code node.
+NodeList CodeNode::members(const DataFlowGraph &G) const {
+ static auto True = [] (NodeAddr<NodeBase*>) -> bool { return true; };
+ return members_if(True, G);
+}
+
+// Return the owner of the given instr node.
+NodeAddr<NodeBase*> InstrNode::getOwner(const DataFlowGraph &G) {
+ NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext());
+
+ while (NA.Addr != this) {
+ assert(NA.Addr->getType() == NodeAttrs::Code);
+ if (NA.Addr->getKind() == NodeAttrs::Block)
+ return NA;
+ NA = G.addr<NodeBase*>(NA.Addr->getNext());
+ }
+ llvm_unreachable("No owner in circular list");
+}
+
+// Add the phi node PA to the given block node.
+void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) {
+ NodeAddr<NodeBase*> M = getFirstMember(G);
+ if (M.Id == 0) {
+ addMember(PA, G);
+ return;
+ }
+
+ assert(M.Addr->getType() == NodeAttrs::Code);
+ if (M.Addr->getKind() == NodeAttrs::Stmt) {
+ // If the first member of the block is a statement, insert the phi as
+ // the first member.
+ Code.FirstM = PA.Id;
+ PA.Addr->setNext(M.Id);
+ } else {
+ // If the first member is a phi, find the last phi, and append PA to it.
+ assert(M.Addr->getKind() == NodeAttrs::Phi);
+ NodeAddr<NodeBase*> MN = M;
+ do {
+ M = MN;
+ MN = G.addr<NodeBase*>(M.Addr->getNext());
+ assert(MN.Addr->getType() == NodeAttrs::Code);
+ } while (MN.Addr->getKind() == NodeAttrs::Phi);
+
+ // M is the last phi.
+ addMemberAfter(M, PA, G);
+ }
+}
+
+// Find the block node corresponding to the machine basic block BB in the
+// given func node.
+NodeAddr<BlockNode*> FuncNode::findBlock(const MachineBasicBlock *BB,
+ const DataFlowGraph &G) const {
+ auto EqBB = [BB] (NodeAddr<NodeBase*> NA) -> bool {
+ return NodeAddr<BlockNode*>(NA).Addr->getCode() == BB;
+ };
+ NodeList Ms = members_if(EqBB, G);
+ if (!Ms.empty())
+ return Ms[0];
+ return NodeAddr<BlockNode*>();
+}
+
+// Get the block node for the entry block in the given function.
+NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) {
+ MachineBasicBlock *EntryB = &getCode()->front();
+ return findBlock(EntryB, G);
+}
+
+// Target operand information.
+//
+
+// For a given instruction, check if there are any bits of the register
+// defined by operand OpNum that can remain unchanged across this def.
+bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum)
+ const {
+ return TII.isPredicated(In);
+}
+
+// Check if the definition made by operand OpNum produces an unspecified
+// value.
+bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum)
+ const {
+ const MachineOperand &Op = In.getOperand(OpNum);
+ if (Op.isRegMask())
+ return true;
+ assert(Op.isReg());
+ if (In.isCall())
+ if (Op.isDef() && Op.isDead())
+ return true;
+ return false;
+}
+
+// Check if the given instruction specifically requires the register in
+// operand OpNum to be a particular, fixed register.
+bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum)
+ const {
+ if (In.isCall() || In.isReturn() || In.isInlineAsm())
+ return true;
+ // Check for a tail call.
+ if (In.isBranch())
+ for (const MachineOperand &O : In.operands())
+ if (O.isGlobal() || O.isSymbol())
+ return true;
+
+ const MCInstrDesc &D = In.getDesc();
+ if (!D.getImplicitDefs() && !D.getImplicitUses())
+ return false;
+ const MachineOperand &Op = In.getOperand(OpNum);
+ // If there is a sub-register, treat the operand as non-fixed. Currently,
+ // fixed registers are those that are listed in the descriptor as implicit
+ // uses or defs, and those lists do not allow sub-registers.
+ if (Op.getSubReg() != 0)
+ return false;
+ Register Reg = Op.getReg();
+ const MCPhysReg *ImpR = Op.isDef() ? D.getImplicitDefs()
+ : D.getImplicitUses();
+ if (!ImpR)
+ return false;
+ while (*ImpR)
+ if (*ImpR++ == Reg)
+ return true;
+ return false;
+}
+
+//
+// The data flow graph construction.
+//
+
+DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
+ const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
+ const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi)
+ : MF(mf), TII(tii), TRI(tri), PRI(tri, mf), MDT(mdt), MDF(mdf), TOI(toi),
+ LiveIns(PRI) {
+}
+
+// The implementation of the definition stack.
+// Each register reference has its own definition stack. In particular,
+// the register references "Reg" and "Reg:subreg" will each have their
+// own definition stacks.
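+// For example, a def of the full register "Reg" and a def of its
+// sub-register "Reg:subreg" are pushed onto separate stacks (keyed by
+// their distinct RegisterIds), even though the two accesses overlap.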
+
+// Construct a stack iterator.
+DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S,
+ bool Top) : DS(S) {
+ if (!Top) {
+ // Initialize to bottom.
+ Pos = 0;
+ return;
+ }
+ // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty).
+ Pos = DS.Stack.size();
+ while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos-1]))
+ Pos--;
+}
+
+// Return the size of the stack, including block delimiters.
+unsigned DataFlowGraph::DefStack::size() const {
+ unsigned S = 0;
+ for (auto I = top(), E = bottom(); I != E; I.down())
+ S++;
+ return S;
+}
+
+// Remove the top entry from the stack. Remove all intervening delimiters
+// so that after this, the stack is either empty, or the top of the stack
+// is a non-delimiter.
+void DataFlowGraph::DefStack::pop() {
+ assert(!empty());
+ unsigned P = nextDown(Stack.size());
+ Stack.resize(P);
+}
+
+// Push a delimiter for block node N on the stack.
+void DataFlowGraph::DefStack::start_block(NodeId N) {
+ assert(N != 0);
+ Stack.push_back(NodeAddr<DefNode*>(nullptr, N));
+}
+
+// Remove all nodes from the top of the stack, until the delimiter for
+// block node N is encountered. Remove the delimiter as well. In effect,
+// this will remove from the stack all definitions from block N.
+void DataFlowGraph::DefStack::clear_block(NodeId N) {
+ assert(N != 0);
+ unsigned P = Stack.size();
+ while (P > 0) {
+ bool Found = isDelimiter(Stack[P-1], N);
+ P--;
+ if (Found)
+ break;
+ }
+ // This will also remove the delimiter, if found.
+ Stack.resize(P);
+}
+
+// Move the stack iterator up by one.
+unsigned DataFlowGraph::DefStack::nextUp(unsigned P) const {
+ // Get the next valid position after P (skipping all delimiters).
+ // The input position P does not have to point to a non-delimiter.
+ unsigned SS = Stack.size();
+ bool IsDelim;
+ assert(P < SS);
+ do {
+ P++;
+ IsDelim = isDelimiter(Stack[P-1]);
+ } while (P < SS && IsDelim);
+ assert(!IsDelim);
+ return P;
+}
+
+// Move the stack iterator down by one.
+unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
+ // Get the preceding valid position before P (skipping all delimiters).
+ // The input position P does not have to point to a non-delimiter.
+ assert(P > 0 && P <= Stack.size());
+ bool IsDelim = isDelimiter(Stack[P-1]);
+ do {
+ if (--P == 0)
+ break;
+ IsDelim = isDelimiter(Stack[P-1]);
+ } while (P > 0 && IsDelim);
+ assert(!IsDelim);
+ return P;
+}
+
+// Register information.
+
+RegisterSet DataFlowGraph::getLandingPadLiveIns() const {
+ RegisterSet LR;
+ const Function &F = MF.getFunction();
+ const Constant *PF = F.hasPersonalityFn() ? F.getPersonalityFn()
+ : nullptr;
+ const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
+ if (RegisterId R = TLI.getExceptionPointerRegister(PF))
+ LR.insert(RegisterRef(R));
+ if (!isFuncletEHPersonality(classifyEHPersonality(PF))) {
+ if (RegisterId R = TLI.getExceptionSelectorRegister(PF))
+ LR.insert(RegisterRef(R));
+ }
+ return LR;
+}
+
+// Node management functions.
+
+// Get the pointer to the node with the id N.
+NodeBase *DataFlowGraph::ptr(NodeId N) const {
+ if (N == 0)
+ return nullptr;
+ return Memory.ptr(N);
+}
+
+// Get the id of the node at the address P.
+NodeId DataFlowGraph::id(const NodeBase *P) const {
+ if (P == nullptr)
+ return 0;
+ return Memory.id(P);
+}
+
+// Allocate a new node and set the attributes to Attrs.
+NodeAddr<NodeBase*> DataFlowGraph::newNode(uint16_t Attrs) {
+ NodeAddr<NodeBase*> P = Memory.New();
+ P.Addr->init();
+ P.Addr->setAttrs(Attrs);
+ return P;
+}
+
+// Make a copy of the given node B, except for the data-flow links, which
+// are set to 0.
+NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) {
+ NodeAddr<NodeBase*> NA = newNode(0);
+ memcpy(NA.Addr, B.Addr, sizeof(NodeBase));
+ // Ref nodes need to have the data-flow links reset.
+ if (NA.Addr->getType() == NodeAttrs::Ref) {
+ NodeAddr<RefNode*> RA = NA;
+ RA.Addr->setReachingDef(0);
+ RA.Addr->setSibling(0);
+ if (NA.Addr->getKind() == NodeAttrs::Def) {
+ NodeAddr<DefNode*> DA = NA;
+ DA.Addr->setReachedDef(0);
+ DA.Addr->setReachedUse(0);
+ }
+ }
+ return NA;
+}
+
+// Allocation routines for specific node types/kinds.
+
+NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner,
+ MachineOperand &Op, uint16_t Flags) {
+ NodeAddr<UseNode*> UA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags);
+ UA.Addr->setRegRef(&Op, *this);
+ return UA;
+}
+
+NodeAddr<PhiUseNode*> DataFlowGraph::newPhiUse(NodeAddr<PhiNode*> Owner,
+ RegisterRef RR, NodeAddr<BlockNode*> PredB, uint16_t Flags) {
+ NodeAddr<PhiUseNode*> PUA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags);
+ assert(Flags & NodeAttrs::PhiRef);
+ PUA.Addr->setRegRef(RR, *this);
+ PUA.Addr->setPredecessor(PredB.Id);
+ return PUA;
+}
+
+NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner,
+ MachineOperand &Op, uint16_t Flags) {
+ NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags);
+ DA.Addr->setRegRef(&Op, *this);
+ return DA;
+}
+
+NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner,
+ RegisterRef RR, uint16_t Flags) {
+ NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags);
+ assert(Flags & NodeAttrs::PhiRef);
+ DA.Addr->setRegRef(RR, *this);
+ return DA;
+}
+
+NodeAddr<PhiNode*> DataFlowGraph::newPhi(NodeAddr<BlockNode*> Owner) {
+ NodeAddr<PhiNode*> PA = newNode(NodeAttrs::Code | NodeAttrs::Phi);
+ Owner.Addr->addPhi(PA, *this);
+ return PA;
+}
+
+NodeAddr<StmtNode*> DataFlowGraph::newStmt(NodeAddr<BlockNode*> Owner,
+ MachineInstr *MI) {
+ NodeAddr<StmtNode*> SA = newNode(NodeAttrs::Code | NodeAttrs::Stmt);
+ SA.Addr->setCode(MI);
+ Owner.Addr->addMember(SA, *this);
+ return SA;
+}
+
+NodeAddr<BlockNode*> DataFlowGraph::newBlock(NodeAddr<FuncNode*> Owner,
+ MachineBasicBlock *BB) {
+ NodeAddr<BlockNode*> BA = newNode(NodeAttrs::Code | NodeAttrs::Block);
+ BA.Addr->setCode(BB);
+ Owner.Addr->addMember(BA, *this);
+ return BA;
+}
+
+NodeAddr<FuncNode*> DataFlowGraph::newFunc(MachineFunction *MF) {
+ NodeAddr<FuncNode*> FA = newNode(NodeAttrs::Code | NodeAttrs::Func);
+ FA.Addr->setCode(MF);
+ return FA;
+}
+
+// Build the data flow graph.
+void DataFlowGraph::build(unsigned Options) {
+ reset();
+ Func = newFunc(&MF);
+
+ if (MF.empty())
+ return;
+
+ for (MachineBasicBlock &B : MF) {
+ NodeAddr<BlockNode*> BA = newBlock(Func, &B);
+ BlockNodes.insert(std::make_pair(&B, BA));
+ for (MachineInstr &I : B) {
+ if (I.isDebugInstr())
+ continue;
+ buildStmt(BA, I);
+ }
+ }
+
+ NodeAddr<BlockNode*> EA = Func.Addr->getEntryBlock(*this);
+ NodeList Blocks = Func.Addr->members(*this);
+
+ // Collect information about block references.
+ RegisterSet AllRefs;
+ for (NodeAddr<BlockNode*> BA : Blocks)
+ for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this))
+ for (NodeAddr<RefNode*> RA : IA.Addr->members(*this))
+ AllRefs.insert(RA.Addr->getRegRef(*this));
+
+ // Collect function live-ins and entry block live-ins.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineBasicBlock &EntryB = *EA.Addr->getCode();
+ assert(EntryB.pred_empty() && "Function entry block has predecessors");
+ for (std::pair<unsigned,unsigned> P : MRI.liveins())
+ LiveIns.insert(RegisterRef(P.first));
+ if (MRI.tracksLiveness()) {
+ for (auto I : EntryB.liveins())
+ LiveIns.insert(RegisterRef(I.PhysReg, I.LaneMask));
+ }
+
+ // Add function-entry phi nodes for the live-in registers.
+ //for (std::pair<RegisterId,LaneBitmask> P : LiveIns) {
+ for (auto I = LiveIns.rr_begin(), E = LiveIns.rr_end(); I != E; ++I) {
+ RegisterRef RR = *I;
+ NodeAddr<PhiNode*> PA = newPhi(EA);
+ uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+ NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+ PA.Addr->addMember(DA, *this);
+ }
+
+ // Add phis for landing pads.
+ // Landing pads, unlike usual basic blocks, are not entered through
+ // branches in the program, or fall-throughs from other blocks. They
+ // are entered from the exception handling runtime, and the target's
+ // ABI may define certain registers as defined on entry to such a block.
+ RegisterSet EHRegs = getLandingPadLiveIns();
+ if (!EHRegs.empty()) {
+ for (NodeAddr<BlockNode*> BA : Blocks) {
+ const MachineBasicBlock &B = *BA.Addr->getCode();
+ if (!B.isEHPad())
+ continue;
+
+ // Prepare a list of NodeIds of the block's predecessors.
+ NodeList Preds;
+ for (MachineBasicBlock *PB : B.predecessors())
+ Preds.push_back(findBlock(PB));
+
+ // Build phi nodes for each live-in.
+ for (RegisterRef RR : EHRegs) {
+ NodeAddr<PhiNode*> PA = newPhi(BA);
+ uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+ // Add def:
+ NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+ PA.Addr->addMember(DA, *this);
+ // Add uses (no reaching defs for phi uses):
+ for (NodeAddr<BlockNode*> PBA : Preds) {
+ NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA);
+ PA.Addr->addMember(PUA, *this);
+ }
+ }
+ }
+ }
+
+ // Build a map "PhiM" which will contain, for each block, the set
+ // of references that will require phi definitions in that block.
+ BlockRefsMap PhiM;
+ for (NodeAddr<BlockNode*> BA : Blocks)
+ recordDefsForDF(PhiM, BA);
+ for (NodeAddr<BlockNode*> BA : Blocks)
+ buildPhis(PhiM, AllRefs, BA);
+
+ // Link all the refs. This will recursively traverse the dominator tree.
+ DefStackMap DM;
+ linkBlockRefs(DM, EA);
+
+ // Finally, remove all unused phi nodes.
+ if (!(Options & BuildOptions::KeepDeadPhis))
+ removeUnusedPhis();
+}
+
+RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const {
+ assert(PhysicalRegisterInfo::isRegMaskId(Reg) ||
+ Register::isPhysicalRegister(Reg));
+ assert(Reg != 0);
+ if (Sub != 0)
+ Reg = TRI.getSubReg(Reg, Sub);
+ return RegisterRef(Reg);
+}
+
+RegisterRef DataFlowGraph::makeRegRef(const MachineOperand &Op) const {
+ assert(Op.isReg() || Op.isRegMask());
+ if (Op.isReg())
+ return makeRegRef(Op.getReg(), Op.getSubReg());
+ return RegisterRef(PRI.getRegMaskId(Op.getRegMask()), LaneBitmask::getAll());
+}
+
+RegisterRef DataFlowGraph::restrictRef(RegisterRef AR, RegisterRef BR) const {
+ if (AR.Reg == BR.Reg) {
+ LaneBitmask M = AR.Mask & BR.Mask;
+ return M.any() ? RegisterRef(AR.Reg, M) : RegisterRef();
+ }
+#ifndef NDEBUG
+// RegisterRef NAR = PRI.normalize(AR);
+// RegisterRef NBR = PRI.normalize(BR);
+// assert(NAR.Reg != NBR.Reg);
+#endif
+ // This isn't strictly correct, because the overlap may happen in the
+ // part masked out.
+ if (PRI.alias(AR, BR))
+ return AR;
+ return RegisterRef();
+}
+
+// For each stack in the map DefM, push the delimiter for block B on it.
+void DataFlowGraph::markBlock(NodeId B, DefStackMap &DefM) {
+ // Push block delimiters.
+ for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I)
+ I->second.start_block(B);
+}
+
+// Remove all definitions coming from block B from each stack in DefM.
+void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) {
+ // Pop all defs from this block from the definition stack. Defs that were
+ // added to the map during the traversal of instructions will not have a
+ // delimiter, but for those, the whole stack will be emptied.
+ for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I)
+ I->second.clear_block(B);
+
+ // Finally, remove empty stacks from the map.
+ for (auto I = DefM.begin(), E = DefM.end(), NextI = I; I != E; I = NextI) {
+ NextI = std::next(I);
+ // This preserves the validity of iterators other than I.
+ if (I->second.empty())
+ DefM.erase(I);
+ }
+}
+
+// Push all definitions from the instruction node IA to an appropriate
+// stack in DefM.
+void DataFlowGraph::pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+ pushClobbers(IA, DefM);
+ pushDefs(IA, DefM);
+}
+
+// Push all definitions from the instruction node IA to an appropriate
+// stack in DefM.
+void DataFlowGraph::pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+ NodeSet Visited;
+ std::set<RegisterId> Defined;
+
+ // The important objectives of this function are:
+ // - to be able to handle instructions both while the graph is being
+ // constructed, and after the graph has been constructed, and
+ // - maintain proper ordering of definitions on the stack for each
+ // register reference:
+ // - if there are two or more related defs in IA (i.e. coming from
+ // the same machine operand), then only push one def on the stack,
+ // - if there are multiple unrelated defs of non-overlapping
+ // subregisters of S, then the stack for S will have both (in an
+ // unspecified order), but the order does not matter from the data-
+ // -flow perspective.
+
+ for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) {
+ if (Visited.count(DA.Id))
+ continue;
+ if (!(DA.Addr->getFlags() & NodeAttrs::Clobbering))
+ continue;
+
+ NodeList Rel = getRelatedRefs(IA, DA);
+ NodeAddr<DefNode*> PDA = Rel.front();
+ RegisterRef RR = PDA.Addr->getRegRef(*this);
+
+ // Push the definition on the stack for the register and all aliases.
+ // The def stack traversal in linkNodeUp will check the exact aliasing.
+ DefM[RR.Reg].push(DA);
+ Defined.insert(RR.Reg);
+ for (RegisterId A : PRI.getAliasSet(RR.Reg)) {
+ // Check that we don't push the same def twice.
+ assert(A != RR.Reg);
+ if (!Defined.count(A))
+ DefM[A].push(DA);
+ }
+ // Mark all the related defs as visited.
+ for (NodeAddr<NodeBase*> T : Rel)
+ Visited.insert(T.Id);
+ }
+}
+
+// Push all definitions from the instruction node IA to an appropriate
+// stack in DefM.
+void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+ NodeSet Visited;
+#ifndef NDEBUG
+ std::set<RegisterId> Defined;
+#endif
+
+ // The important objectives of this function are:
+ // - to be able to handle instructions both while the graph is being
+ // constructed, and after the graph has been constructed, and
+ // - maintain proper ordering of definitions on the stack for each
+ // register reference:
+ // - if there are two or more related defs in IA (i.e. coming from
+ // the same machine operand), then only push one def on the stack,
+ // - if there are multiple unrelated defs of non-overlapping
+ // subregisters of S, then the stack for S will have both (in an
+ // unspecified order), but the order does not matter from the data-
+ // -flow perspective.
+
+ for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) {
+ if (Visited.count(DA.Id))
+ continue;
+ if (DA.Addr->getFlags() & NodeAttrs::Clobbering)
+ continue;
+
+ NodeList Rel = getRelatedRefs(IA, DA);
+ NodeAddr<DefNode*> PDA = Rel.front();
+ RegisterRef RR = PDA.Addr->getRegRef(*this);
+#ifndef NDEBUG
+ // Assert if the register is defined in two or more unrelated defs.
+ // This could happen if there are two or more def operands defining it.
+ if (!Defined.insert(RR.Reg).second) {
+ MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
+ dbgs() << "Multiple definitions of register: "
+ << Print<RegisterRef>(RR, *this) << " in\n " << *MI << "in "
+ << printMBBReference(*MI->getParent()) << '\n';
+ llvm_unreachable(nullptr);
+ }
+#endif
+ // Push the definition on the stack for the register and all aliases.
+ // The def stack traversal in linkNodeUp will check the exact aliasing.
+ DefM[RR.Reg].push(DA);
+ for (RegisterId A : PRI.getAliasSet(RR.Reg)) {
+ // Check that we don't push the same def twice.
+ assert(A != RR.Reg);
+ DefM[A].push(DA);
+ }
+ // Mark all the related defs as visited.
+ for (NodeAddr<NodeBase*> T : Rel)
+ Visited.insert(T.Id);
+ }
+}
+
+// Return the list of all reference nodes related to RA, including RA itself.
+// See "getNextRelated" for the meaning of a "related reference".
+NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const {
+ assert(IA.Id != 0 && RA.Id != 0);
+
+ NodeList Refs;
+ NodeId Start = RA.Id;
+ do {
+ Refs.push_back(RA);
+ RA = getNextRelated(IA, RA);
+ } while (RA.Id != 0 && RA.Id != Start);
+ return Refs;
+}
+
+// Clear all information in the graph.
+void DataFlowGraph::reset() {
+ Memory.clear();
+ BlockNodes.clear();
+ Func = NodeAddr<FuncNode*>();
+}
+
+// Return the next reference node in the instruction node IA that is related
+// to RA. Conceptually, two reference nodes are related if they refer to the
+// same instance of a register access, but differ in flags or other minor
+// characteristics. Specific examples of related nodes are shadow reference
+// nodes.
+// Return the equivalent of nullptr if there are no more related references.
+NodeAddr<RefNode*> DataFlowGraph::getNextRelated(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const {
+ assert(IA.Id != 0 && RA.Id != 0);
+
+ auto Related = [this,RA](NodeAddr<RefNode*> TA) -> bool {
+ if (TA.Addr->getKind() != RA.Addr->getKind())
+ return false;
+ if (TA.Addr->getRegRef(*this) != RA.Addr->getRegRef(*this))
+ return false;
+ return true;
+ };
+ auto RelatedStmt = [&Related,RA](NodeAddr<RefNode*> TA) -> bool {
+ return Related(TA) &&
+ &RA.Addr->getOp() == &TA.Addr->getOp();
+ };
+ auto RelatedPhi = [&Related,RA](NodeAddr<RefNode*> TA) -> bool {
+ if (!Related(TA))
+ return false;
+ if (TA.Addr->getKind() != NodeAttrs::Use)
+ return true;
+ // For phi uses, compare predecessor blocks.
+ const NodeAddr<const PhiUseNode*> TUA = TA;
+ const NodeAddr<const PhiUseNode*> RUA = RA;
+ return TUA.Addr->getPredecessor() == RUA.Addr->getPredecessor();
+ };
+
+ RegisterRef RR = RA.Addr->getRegRef(*this);
+ if (IA.Addr->getKind() == NodeAttrs::Stmt)
+ return RA.Addr->getNextRef(RR, RelatedStmt, true, *this);
+ return RA.Addr->getNextRef(RR, RelatedPhi, true, *this);
+}
+
+// Find the next node related to RA in IA that satisfies condition P.
+// If such a node was found, return a pair where the second element is the
+// located node. If such a node does not exist, return a pair where the
+// first element is the element after which such a node should be inserted,
+// and the second element is a null-address.
+template <typename Predicate>
+std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
+DataFlowGraph::locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
+ Predicate P) const {
+ assert(IA.Id != 0 && RA.Id != 0);
+
+ NodeAddr<RefNode*> NA;
+ NodeId Start = RA.Id;
+ while (true) {
+ NA = getNextRelated(IA, RA);
+ if (NA.Id == 0 || NA.Id == Start)
+ break;
+ if (P(NA))
+ break;
+ RA = NA;
+ }
+
+ if (NA.Id != 0 && NA.Id != Start)
+ return std::make_pair(RA, NA);
+ return std::make_pair(RA, NodeAddr<RefNode*>());
+}
+
+// Get the next shadow node in IA corresponding to RA, and optionally create
+// such a node if it does not exist.
+NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA, bool Create) {
+ assert(IA.Id != 0 && RA.Id != 0);
+
+ uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+ auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool {
+ return TA.Addr->getFlags() == Flags;
+ };
+ auto Loc = locateNextRef(IA, RA, IsShadow);
+ if (Loc.second.Id != 0 || !Create)
+ return Loc.second;
+
+ // Create a copy of RA and mark it as a shadow.
+ NodeAddr<RefNode*> NA = cloneNode(RA);
+ NA.Addr->setFlags(Flags | NodeAttrs::Shadow);
+ IA.Addr->addMemberAfter(Loc.first, NA, *this);
+ return NA;
+}
+
+// Get the next shadow node in IA corresponding to RA. Return null-address
+// if such a node does not exist.
+NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
+ NodeAddr<RefNode*> RA) const {
+ assert(IA.Id != 0 && RA.Id != 0);
+ uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+ auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool {
+ return TA.Addr->getFlags() == Flags;
+ };
+ return locateNextRef(IA, RA, IsShadow).second;
+}
+
+// Create a new statement node in the block node BA that corresponds to
+// the machine instruction MI.
+void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
+ NodeAddr<StmtNode*> SA = newStmt(BA, &In);
+
+ auto isCall = [] (const MachineInstr &In) -> bool {
+ if (In.isCall())
+ return true;
+ // Is this a tail call?
+ if (In.isBranch()) {
+ for (const MachineOperand &Op : In.operands())
+ if (Op.isGlobal() || Op.isSymbol())
+ return true;
+ // Assume indirect branches are calls. This is for the purpose of
+ // keeping implicit operands, and so it won't hurt on intra-function
+ // indirect branches.
+ if (In.isIndirectBranch())
+ return true;
+ }
+ return false;
+ };
+
+ auto isDefUndef = [this] (const MachineInstr &In, RegisterRef DR) -> bool {
+ // This instruction defines DR. Check if there is a use operand that
+ // would make DR live on entry to the instruction.
+ for (const MachineOperand &Op : In.operands()) {
+ if (!Op.isReg() || Op.getReg() == 0 || !Op.isUse() || Op.isUndef())
+ continue;
+ RegisterRef UR = makeRegRef(Op);
+ if (PRI.alias(DR, UR))
+ return false;
+ }
+ return true;
+ };
+
+ bool IsCall = isCall(In);
+ unsigned NumOps = In.getNumOperands();
+
+ // Avoid duplicate implicit defs. This will not detect cases of implicit
+ // defs that define registers that overlap, but it is not clear how to
+ // interpret that in the absence of explicit defs. Overlapping explicit
+ // defs are likely illegal already.
+ BitVector DoneDefs(TRI.getNumRegs());
+ // Process explicit defs first.
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+ continue;
+ Register R = Op.getReg();
+ if (!R || !Register::isPhysicalRegister(R))
+ continue;
+ uint16_t Flags = NodeAttrs::None;
+ if (TOI.isPreserving(In, OpN)) {
+ Flags |= NodeAttrs::Preserving;
+ // If the def is preserving, check if it is also undefined.
+ if (isDefUndef(In, makeRegRef(Op)))
+ Flags |= NodeAttrs::Undef;
+ }
+ if (TOI.isClobbering(In, OpN))
+ Flags |= NodeAttrs::Clobbering;
+ if (TOI.isFixedReg(In, OpN))
+ Flags |= NodeAttrs::Fixed;
+ if (IsCall && Op.isDead())
+ Flags |= NodeAttrs::Dead;
+ NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
+ SA.Addr->addMember(DA, *this);
+ assert(!DoneDefs.test(R));
+ DoneDefs.set(R);
+ }
+
+ // Process reg-masks (as clobbers).
+ BitVector DoneClobbers(TRI.getNumRegs());
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isRegMask())
+ continue;
+ uint16_t Flags = NodeAttrs::Clobbering | NodeAttrs::Fixed |
+ NodeAttrs::Dead;
+ NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
+ SA.Addr->addMember(DA, *this);
+ // Record all clobbered registers in DoneClobbers.
+ const uint32_t *RM = Op.getRegMask();
+ for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i)
+ if (!(RM[i/32] & (1u << (i%32))))
+ DoneClobbers.set(i);
+ }
+
+ // Process implicit defs, skipping those that have already been added
+ // as explicit.
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isReg() || !Op.isDef() || !Op.isImplicit())
+ continue;
+ Register R = Op.getReg();
+ if (!R || !Register::isPhysicalRegister(R) || DoneDefs.test(R))
+ continue;
+ RegisterRef RR = makeRegRef(Op);
+ uint16_t Flags = NodeAttrs::None;
+ if (TOI.isPreserving(In, OpN)) {
+ Flags |= NodeAttrs::Preserving;
+ // If the def is preserving, check if it is also undefined.
+ if (isDefUndef(In, RR))
+ Flags |= NodeAttrs::Undef;
+ }
+ if (TOI.isClobbering(In, OpN))
+ Flags |= NodeAttrs::Clobbering;
+ if (TOI.isFixedReg(In, OpN))
+ Flags |= NodeAttrs::Fixed;
+ if (IsCall && Op.isDead()) {
+ if (DoneClobbers.test(R))
+ continue;
+ Flags |= NodeAttrs::Dead;
+ }
+ NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
+ SA.Addr->addMember(DA, *this);
+ DoneDefs.set(R);
+ }
+
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ Register R = Op.getReg();
+ if (!R || !Register::isPhysicalRegister(R))
+ continue;
+ uint16_t Flags = NodeAttrs::None;
+ if (Op.isUndef())
+ Flags |= NodeAttrs::Undef;
+ if (TOI.isFixedReg(In, OpN))
+ Flags |= NodeAttrs::Fixed;
+ NodeAddr<UseNode*> UA = newUse(SA, Op, Flags);
+ SA.Addr->addMember(UA, *this);
+ }
+}
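+
+// A note on the reg-mask convention used in the loop above: the mask is an
+// array of 32-bit words with one bit per physical register, a *set* bit
+// meaning the register is preserved across the instruction, so a *clear*
+// bit marks a clobber. A standalone decoder of that layout (illustrative
+// sketch only, excluded from the build):
+#if 0
+#include <cstdint>
+
+bool isClobberedByMask(const uint32_t *Mask, unsigned Reg) {
+  // Bit Reg set => preserved; bit Reg clear => clobbered.
+  return (Mask[Reg / 32] & (1u << (Reg % 32))) == 0;
+}
+#endif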
+
+// Scan all defs in the block node BA and record in PhiM the locations of
+// phi nodes corresponding to these defs.
+void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM,
+ NodeAddr<BlockNode*> BA) {
+ // Check all defs from block BA and record them in each block in BA's
+ // iterated dominance frontier. This information will later be used to
+ // create phi nodes.
+ MachineBasicBlock *BB = BA.Addr->getCode();
+ assert(BB);
+ auto DFLoc = MDF.find(BB);
+ if (DFLoc == MDF.end() || DFLoc->second.empty())
+ return;
+
+ // Traverse all instructions in the block and collect the set of all
+ // defined references. For each reference there will be a phi created
+ // in the block's iterated dominance frontier.
+ // This is done to make sure that each defined reference gets only one
+ // phi node, even if it is defined multiple times.
+ RegisterSet Defs;
+ for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this))
+ for (NodeAddr<RefNode*> RA : IA.Addr->members_if(IsDef, *this))
+ Defs.insert(RA.Addr->getRegRef(*this));
+
+ // Calculate the iterated dominance frontier of BB.
+ const MachineDominanceFrontier::DomSetType &DF = DFLoc->second;
+ SetVector<MachineBasicBlock*> IDF(DF.begin(), DF.end());
+ for (unsigned i = 0; i < IDF.size(); ++i) {
+ auto F = MDF.find(IDF[i]);
+ if (F != MDF.end())
+ IDF.insert(F->second.begin(), F->second.end());
+ }
+
+ // Finally, add the set of defs to each block in the iterated dominance
+ // frontier.
+ for (auto DB : IDF) {
+ NodeAddr<BlockNode*> DBA = findBlock(DB);
+ PhiM[DBA.Id].insert(Defs.begin(), Defs.end());
+ }
+}
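+
+// The loop above computes the iterated dominance frontier as a least fixed
+// point: IDF(B) is the smallest set containing DF(B) that also contains
+// DF(X) for every X already in it. The same worklist, standalone, with
+// blocks reduced to ints (illustrative sketch only, excluded from the
+// build):
+#if 0
+#include <cstddef>
+#include <map>
+#include <set>
+#include <vector>
+
+std::set<int> iteratedDF(const std::map<int, std::set<int>> &DF, int B) {
+  std::vector<int> Work; // insertion-ordered worklist, mirrors SetVector
+  std::set<int> Seen;    // membership test; doubles as the result
+  auto Push = [&](int X) {
+    if (Seen.insert(X).second)
+      Work.push_back(X);
+  };
+  auto F = DF.find(B);
+  if (F != DF.end())
+    for (int X : F->second)
+      Push(X);
+  for (std::size_t i = 0; i < Work.size(); ++i) { // grows while iterating
+    auto G = DF.find(Work[i]);
+    if (G != DF.end())
+      for (int X : G->second)
+        Push(X);
+  }
+  return Seen;
+}
+#endif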
+
+// Given the locations of phi nodes in the map PhiM, create the phi nodes
+// that are located in the block node BA.
+void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs,
+ NodeAddr<BlockNode*> BA) {
+ // Check if this block has any DF defs, i.e. if there are any defs
+ // that this block is in the iterated dominance frontier of.
+ auto HasDF = PhiM.find(BA.Id);
+ if (HasDF == PhiM.end() || HasDF->second.empty())
+ return;
+
+ // First, remove all R in Refs such that there exists T in Refs
+ // such that T covers R. In other words, only leave those refs that
+ // are not covered by another ref (i.e. maximal with respect to covering).
+
+ auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef {
+ for (RegisterRef I : RRs)
+ if (I != RR && RegisterAggr::isCoverOf(I, RR, PRI))
+ RR = I;
+ return RR;
+ };
+
+ RegisterSet MaxDF;
+ for (RegisterRef I : HasDF->second)
+ MaxDF.insert(MaxCoverIn(I, HasDF->second));
+
+ std::vector<RegisterRef> MaxRefs;
+ for (RegisterRef I : MaxDF)
+ MaxRefs.push_back(MaxCoverIn(I, AllRefs));
+
+ // Now, for each R in MaxRefs, get the alias closure of R. If the closure
+ // only has R in it, create a phi with a def for R. Otherwise, create a phi,
+ // and add a def for each S in the closure.
+
+ // Sort the refs so that the phis will be created in a deterministic order.
+ llvm::sort(MaxRefs);
+ // Remove duplicates.
+ auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end());
+ MaxRefs.erase(NewEnd, MaxRefs.end());
+
+ auto Aliased = [this,&MaxRefs](RegisterRef RR,
+ std::vector<unsigned> &Closure) -> bool {
+ for (unsigned I : Closure)
+ if (PRI.alias(RR, MaxRefs[I]))
+ return true;
+ return false;
+ };
+
+ // Prepare a list of the block's predecessor block nodes.
+ NodeList Preds;
+ const MachineBasicBlock *MBB = BA.Addr->getCode();
+ for (MachineBasicBlock *PB : MBB->predecessors())
+ Preds.push_back(findBlock(PB));
+
+ while (!MaxRefs.empty()) {
+ // Put the first element in the closure, and then add all subsequent
+ // elements from MaxRefs to it, if they alias at least one element
+ // already in the closure.
+ // ClosureIdx: vector of indices in MaxRefs of members of the closure.
+ std::vector<unsigned> ClosureIdx = { 0 };
+ for (unsigned i = 1; i != MaxRefs.size(); ++i)
+ if (Aliased(MaxRefs[i], ClosureIdx))
+ ClosureIdx.push_back(i);
+
+ // Build a phi for the closure.
+ unsigned CS = ClosureIdx.size();
+ NodeAddr<PhiNode*> PA = newPhi(BA);
+
+ // Add defs.
+ for (unsigned X = 0; X != CS; ++X) {
+ RegisterRef RR = MaxRefs[ClosureIdx[X]];
+ uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+ NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+ PA.Addr->addMember(DA, *this);
+ }
+ // Add phi uses.
+ for (NodeAddr<BlockNode*> PBA : Preds) {
+ for (unsigned X = 0; X != CS; ++X) {
+ RegisterRef RR = MaxRefs[ClosureIdx[X]];
+ NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA);
+ PA.Addr->addMember(PUA, *this);
+ }
+ }
+
+ // Erase from MaxRefs all elements in the closure.
+ auto Begin = MaxRefs.begin();
+ for (unsigned i = ClosureIdx.size(); i != 0; --i)
+ MaxRefs.erase(Begin + ClosureIdx[i-1]);
+ }
+}
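+
+// The closure loop above effectively partitions MaxRefs into groups, where
+// a ref joins a group as soon as it aliases at least one member already in
+// it; each group then gets one phi. The same grouping over plain ints with
+// a pluggable alias predicate (illustrative sketch only, excluded from the
+// build):
+#if 0
+#include <cstddef>
+#include <functional>
+#include <vector>
+
+std::vector<std::vector<int>>
+aliasClosures(std::vector<int> Refs,
+              const std::function<bool(int, int)> &Alias) {
+  std::vector<std::vector<int>> Groups;
+  while (!Refs.empty()) {
+    std::vector<int> G = {Refs[0]}; // seed the closure with the first ref
+    std::vector<int> Rest;
+    for (std::size_t i = 1; i < Refs.size(); ++i) {
+      bool Joins = false;
+      for (int M : G)
+        if (Alias(Refs[i], M)) {
+          Joins = true;
+          break;
+        }
+      (Joins ? G : Rest).push_back(Refs[i]);
+    }
+    Groups.push_back(G);
+    Refs = Rest; // everything not absorbed seeds later closures
+  }
+  return Groups;
+}
+#endif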
+
+// Remove any unneeded phi nodes that were created during the build process.
+void DataFlowGraph::removeUnusedPhis() {
+ // This will remove unused phis, i.e. phis where no def reaches any uses
+ // or other defs. This will not detect or remove circular phi chains that
+ // are otherwise dead. Unused/dead phis are created during the build
+ // process, and this function is intended to remove the cases that are
+ // easy to identify as unnecessary.
+
+ SetVector<NodeId> PhiQ;
+ for (NodeAddr<BlockNode*> BA : Func.Addr->members(*this)) {
+ for (auto P : BA.Addr->members_if(IsPhi, *this))
+ PhiQ.insert(P.Id);
+ }
+
+ static auto HasUsedDef = [](NodeList &Ms) -> bool {
+ for (NodeAddr<NodeBase*> M : Ms) {
+ if (M.Addr->getKind() != NodeAttrs::Def)
+ continue;
+ NodeAddr<DefNode*> DA = M;
+ if (DA.Addr->getReachedDef() != 0 || DA.Addr->getReachedUse() != 0)
+ return true;
+ }
+ return false;
+ };
+
+ // Any phi, if it is removed, may affect other phis (make them dead).
+ // For each removed phi, collect the potentially affected phis and add
+ // them back to the queue.
+ while (!PhiQ.empty()) {
+ auto PA = addr<PhiNode*>(PhiQ[0]);
+ PhiQ.remove(PA.Id);
+ NodeList Refs = PA.Addr->members(*this);
+ if (HasUsedDef(Refs))
+ continue;
+ for (NodeAddr<RefNode*> RA : Refs) {
+ if (NodeId RD = RA.Addr->getReachingDef()) {
+ auto RDA = addr<DefNode*>(RD);
+ NodeAddr<InstrNode*> OA = RDA.Addr->getOwner(*this);
+ if (IsPhi(OA))
+ PhiQ.insert(OA.Id);
+ }
+ if (RA.Addr->isDef())
+ unlinkDef(RA, true);
+ else
+ unlinkUse(RA, true);
+ }
+ NodeAddr<BlockNode*> BA = PA.Addr->getOwner(*this);
+ BA.Addr->removeMember(PA, *this);
+ }
+}
+
+// For a given reference node TA in an instruction node IA, connect the
+// reaching def of TA to the appropriate def node. Create any shadow nodes
+// as appropriate.
+template <typename T>
+void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA,
+ DefStack &DS) {
+ if (DS.empty())
+ return;
+ RegisterRef RR = TA.Addr->getRegRef(*this);
+ NodeAddr<T> TAP;
+
+ // References from the def stack that have been examined so far.
+ RegisterAggr Defs(PRI);
+
+ for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) {
+ RegisterRef QR = I->Addr->getRegRef(*this);
+
+ // Skip all defs that are aliased to any of the defs that we have already
+ // seen. If this completes a cover of RR, stop the stack traversal.
+ bool Alias = Defs.hasAliasOf(QR);
+ bool Cover = Defs.insert(QR).hasCoverOf(RR);
+ if (Alias) {
+ if (Cover)
+ break;
+ continue;
+ }
+
+ // The reaching def.
+ NodeAddr<DefNode*> RDA = *I;
+
+ // Pick the reached node.
+ if (TAP.Id == 0) {
+ TAP = TA;
+ } else {
+ // Mark the existing ref as "shadow" and create a new shadow.
+ TAP.Addr->setFlags(TAP.Addr->getFlags() | NodeAttrs::Shadow);
+ TAP = getNextShadow(IA, TAP, true);
+ }
+
+ // Create the link.
+ TAP.Addr->linkToDef(TAP.Id, RDA);
+
+ if (Cover)
+ break;
+ }
+}
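+
+// The stack walk in linkRefUp can be modeled with register references as
+// bit-sets of register units: "alias" is then a non-empty intersection and
+// "cover" is set inclusion. The walk links to each def that contributes new
+// units and stops once the defs seen so far cover the reference
+// (illustrative sketch only, excluded from the build):
+#if 0
+#include <cstdint>
+#include <vector>
+
+struct StackDef {
+  uint64_t Units; // register units written by this def
+  int Id;
+};
+
+// Return the ids of the defs the reference would be linked to, walking the
+// def stack from top (nearest) to bottom (farthest).
+std::vector<int> linkUpModel(uint64_t Ref,
+                             const std::vector<StackDef> &TopDown) {
+  std::vector<int> Links;
+  uint64_t Seen = 0; // units of the defs examined so far
+  for (const StackDef &D : TopDown) {
+    bool Alias = (Seen & D.Units) != 0; // overlaps something already seen
+    Seen |= D.Units;
+    bool Cover = (Seen & Ref) == Ref;   // seen defs now cover the ref
+    if (Alias) {
+      if (Cover)
+        break;
+      continue; // skip defs aliased to already-examined ones
+    }
+    Links.push_back(D.Id); // in the real code: link via a shadow ref
+    if (Cover)
+      break;
+  }
+  return Links;
+}
+#endif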
+
+// Create data-flow links for all reference nodes in the statement node SA.
+template <typename Predicate>
+void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA,
+ Predicate P) {
+#ifndef NDEBUG
+ RegisterSet Defs;
+#endif
+
+ // Link all nodes (upwards in the data-flow) with their reaching defs.
+ for (NodeAddr<RefNode*> RA : SA.Addr->members_if(P, *this)) {
+ uint16_t Kind = RA.Addr->getKind();
+ assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use);
+ RegisterRef RR = RA.Addr->getRegRef(*this);
+#ifndef NDEBUG
+ // Do not expect multiple defs of the same reference.
+ assert(Kind != NodeAttrs::Def || !Defs.count(RR));
+ Defs.insert(RR);
+#endif
+
+ auto F = DefM.find(RR.Reg);
+ if (F == DefM.end())
+ continue;
+ DefStack &DS = F->second;
+ if (Kind == NodeAttrs::Use)
+ linkRefUp<UseNode*>(SA, RA, DS);
+ else if (Kind == NodeAttrs::Def)
+ linkRefUp<DefNode*>(SA, RA, DS);
+ else
+ llvm_unreachable("Unexpected node in instruction");
+ }
+}
+
+// Create data-flow links for all instructions in the block node BA. This
+// will include updating any phi nodes in BA.
+void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
+ // Push block delimiters.
+ markBlock(BA.Id, DefM);
+
+ auto IsClobber = [] (NodeAddr<RefNode*> RA) -> bool {
+ return IsDef(RA) && (RA.Addr->getFlags() & NodeAttrs::Clobbering);
+ };
+ auto IsNoClobber = [] (NodeAddr<RefNode*> RA) -> bool {
+ return IsDef(RA) && !(RA.Addr->getFlags() & NodeAttrs::Clobbering);
+ };
+
+ assert(BA.Addr && "block node address is needed to create a data-flow link");
+ // For each non-phi instruction in the block, link all the defs and uses
+ // to their reaching defs. For any member of the block (including phis),
+ // push the defs on the corresponding stacks.
+ for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) {
+ // Ignore phi nodes here. They will be linked piecemeal from the
+ // predecessors.
+ if (IA.Addr->getKind() == NodeAttrs::Stmt) {
+ linkStmtRefs(DefM, IA, IsUse);
+ linkStmtRefs(DefM, IA, IsClobber);
+ }
+
+ // Push the definitions on the stack.
+ pushClobbers(IA, DefM);
+
+ if (IA.Addr->getKind() == NodeAttrs::Stmt)
+ linkStmtRefs(DefM, IA, IsNoClobber);
+
+ pushDefs(IA, DefM);
+ }
+
+ // Recursively process all children in the dominator tree.
+ MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode());
+ for (auto I : *N) {
+ MachineBasicBlock *SB = I->getBlock();
+ NodeAddr<BlockNode*> SBA = findBlock(SB);
+ linkBlockRefs(DefM, SBA);
+ }
+
+ // Link the phi uses from the successor blocks.
+ auto IsUseForBA = [BA](NodeAddr<NodeBase*> NA) -> bool {
+ if (NA.Addr->getKind() != NodeAttrs::Use)
+ return false;
+ assert(NA.Addr->getFlags() & NodeAttrs::PhiRef);
+ NodeAddr<PhiUseNode*> PUA = NA;
+ return PUA.Addr->getPredecessor() == BA.Id;
+ };
+
+ RegisterSet EHLiveIns = getLandingPadLiveIns();
+ MachineBasicBlock *MBB = BA.Addr->getCode();
+
+ for (MachineBasicBlock *SB : MBB->successors()) {
+ bool IsEHPad = SB->isEHPad();
+ NodeAddr<BlockNode*> SBA = findBlock(SB);
+ for (NodeAddr<InstrNode*> IA : SBA.Addr->members_if(IsPhi, *this)) {
+ // Do not link phi uses for landing pad live-ins.
+ if (IsEHPad) {
+ // Find what register this phi is for.
+ NodeAddr<RefNode*> RA = IA.Addr->getFirstMember(*this);
+ assert(RA.Id != 0);
+ if (EHLiveIns.count(RA.Addr->getRegRef(*this)))
+ continue;
+ }
+ // Go over each phi use associated with MBB, and link it.
+ for (auto U : IA.Addr->members_if(IsUseForBA, *this)) {
+ NodeAddr<PhiUseNode*> PUA = U;
+ RegisterRef RR = PUA.Addr->getRegRef(*this);
+ linkRefUp<UseNode*>(IA, PUA, DefM[RR.Reg]);
+ }
+ }
+ }
+
+ // Pop all defs from this block from the definition stacks.
+ releaseBlock(BA.Id, DefM);
+}
+
+// Remove the use node UA from any data-flow and structural links.
+void DataFlowGraph::unlinkUseDF(NodeAddr<UseNode*> UA) {
+ NodeId RD = UA.Addr->getReachingDef();
+ NodeId Sib = UA.Addr->getSibling();
+
+ if (RD == 0) {
+ assert(Sib == 0);
+ return;
+ }
+
+ auto RDA = addr<DefNode*>(RD);
+ auto TA = addr<UseNode*>(RDA.Addr->getReachedUse());
+ if (TA.Id == UA.Id) {
+ RDA.Addr->setReachedUse(Sib);
+ return;
+ }
+
+ while (TA.Id != 0) {
+ NodeId S = TA.Addr->getSibling();
+ if (S == UA.Id) {
+ TA.Addr->setSibling(UA.Addr->getSibling());
+ return;
+ }
+ TA = addr<UseNode*>(S);
+ }
+}
+
+// Remove the def node DA from any data-flow and structural links.
+void DataFlowGraph::unlinkDefDF(NodeAddr<DefNode*> DA) {
+ //
+ // RD
+ // | reached
+ // | def
+ // :
+ // .
+ // +----+
+ // ... -- | DA | -- ... -- 0 : sibling chain of DA
+ // +----+
+ // | | reached
+ // | : def
+ // | .
+ // | ... : Siblings (defs)
+ // |
+ // : reached
+ // . use
+ // ... : sibling chain of reached uses
+
+ NodeId RD = DA.Addr->getReachingDef();
+
+ // Visit all siblings of the reached def and reset their reaching defs.
+ // Also, defs reached by DA are now "promoted" to being reached by RD,
+ // so all of them will need to be spliced into the sibling chain where
+ // DA belongs.
+ auto getAllNodes = [this] (NodeId N) -> NodeList {
+ NodeList Res;
+ while (N) {
+ auto RA = addr<RefNode*>(N);
+ // Keep the nodes in the exact sibling order.
+ Res.push_back(RA);
+ N = RA.Addr->getSibling();
+ }
+ return Res;
+ };
+ NodeList ReachedDefs = getAllNodes(DA.Addr->getReachedDef());
+ NodeList ReachedUses = getAllNodes(DA.Addr->getReachedUse());
+
+ if (RD == 0) {
+ for (NodeAddr<RefNode*> I : ReachedDefs)
+ I.Addr->setSibling(0);
+ for (NodeAddr<RefNode*> I : ReachedUses)
+ I.Addr->setSibling(0);
+ }
+ for (NodeAddr<DefNode*> I : ReachedDefs)
+ I.Addr->setReachingDef(RD);
+ for (NodeAddr<UseNode*> I : ReachedUses)
+ I.Addr->setReachingDef(RD);
+
+ NodeId Sib = DA.Addr->getSibling();
+ if (RD == 0) {
+ assert(Sib == 0);
+ return;
+ }
+
+ // Update the reaching def node and remove DA from the sibling list.
+ auto RDA = addr<DefNode*>(RD);
+ auto TA = addr<DefNode*>(RDA.Addr->getReachedDef());
+ if (TA.Id == DA.Id) {
+ // If DA is the first reached def, just update the RD's reached def
+ // to the DA's sibling.
+ RDA.Addr->setReachedDef(Sib);
+ } else {
+ // Otherwise, traverse the sibling list of the reached defs and remove
+ // DA from it.
+ while (TA.Id != 0) {
+ NodeId S = TA.Addr->getSibling();
+ if (S == DA.Id) {
+ TA.Addr->setSibling(Sib);
+ break;
+ }
+ TA = addr<DefNode*>(S);
+ }
+ }
+
+ // Splice the DA's reached defs into the RDA's reached def chain.
+ if (!ReachedDefs.empty()) {
+ auto Last = NodeAddr<DefNode*>(ReachedDefs.back());
+ Last.Addr->setSibling(RDA.Addr->getReachedDef());
+ RDA.Addr->setReachedDef(ReachedDefs.front().Id);
+ }
+ // Splice the DA's reached uses into the RDA's reached use chain.
+ if (!ReachedUses.empty()) {
+ auto Last = NodeAddr<UseNode*>(ReachedUses.back());
+ Last.Addr->setSibling(RDA.Addr->getReachedUse());
+ RDA.Addr->setReachedUse(ReachedUses.front().Id);
+ }
+}
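+
+// The unlinking and splicing above is ordinary singly-linked-list surgery
+// on the sibling chains. With nodes reduced to a bare next pointer, the two
+// primitives look like this (illustrative sketch only, excluded from the
+// build):
+#if 0
+struct Node {
+  Node *Next = nullptr;
+};
+
+// Remove N from the chain headed by *Head, if present.
+void unlink(Node **Head, Node *N) {
+  for (Node **P = Head; *P; P = &(*P)->Next) {
+    if (*P == N) {
+      *P = N->Next; // bypass N
+      break;
+    }
+  }
+}
+
+// Prepend the chain First..Last (Last being its final node) to the chain
+// headed by *Head, preserving the order of both chains.
+void splice(Node **Head, Node *First, Node *Last) {
+  Last->Next = *Head;
+  *Head = First;
+}
+#endif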
Property changes on: head/contrib/llvm-project/llvm/lib/CodeGen/RDFGraph.cpp
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/CodeGen/RDFLiveness.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/CodeGen/RDFLiveness.cpp (nonexistent)
+++ head/contrib/llvm-project/llvm/lib/CodeGen/RDFLiveness.cpp (revision 362609)
@@ -0,0 +1,1118 @@
+//===- RDFLiveness.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Computation of the liveness information from the data-flow graph.
+//
+// The main functionality of this code is to compute block live-in
+// information. With the live-in information in place, the placement
+// of kill flags can also be recalculated.
+//
+// The block live-in calculation is based on the ideas from the following
+// publication:
+//
+// Dibyendu Das, Ramakrishna Upadrasta, Benoit Dupont de Dinechin.
+// "Efficient Liveness Computation Using Merge Sets and DJ-Graphs."
+// ACM Transactions on Architecture and Code Optimization, Association for
+// Computing Machinery, 2012, ACM TACO Special Issue on "High-Performance
+// and Embedded Architectures and Compilers", 8 (4),
+// <10.1145/2086696.2086706>. <hal-00647369>
+//
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFRegisters.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace rdf;
+
+static cl::opt<unsigned> MaxRecNest("rdf-liveness-max-rec", cl::init(25),
+ cl::Hidden, cl::desc("Maximum recursion level"));
+
+namespace llvm {
+namespace rdf {
+
+ raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) {
+ OS << '{';
+ for (auto &I : P.Obj) {
+ OS << ' ' << printReg(I.first, &P.G.getTRI()) << '{';
+ for (auto J = I.second.begin(), E = I.second.end(); J != E; ) {
+ OS << Print<NodeId>(J->first, P.G) << PrintLaneMaskOpt(J->second);
+ if (++J != E)
+ OS << ',';
+ }
+ OS << '}';
+ }
+ OS << " }";
+ return OS;
+ }
+
+} // end namespace rdf
+} // end namespace llvm
+
+// The order in the returned sequence is the order of reaching defs in the
+// upward traversal: the first def is the closest to the given reference RefA,
+// the next one is further up, and so on.
+// The list ends at a reaching phi def, or when the reference from RefA is
+// covered by the defs in the list (see FullChain).
+// This function provides two modes of operation:
+// (1) Returning the sequence of reaching defs for a particular reference
+// node. This sequence will terminate at the first phi node [1].
+// (2) Returning a partial sequence of reaching defs, where the final goal
+// is to traverse past phi nodes to the actual defs arising from the code
+// itself.
+// In mode (2), the register reference for which the search was started
+// may be different from the reference node RefA, for which this call was
+// made, hence the argument RefRR, which holds the original register.
+// Also, some definitions may have already been encountered in a previous
+// call that will influence register covering. The register references
+// already defined are passed in through DefRRs.
+// In mode (1), the "continuation" considerations do not apply: RefRR is
+// the same as the register in RefA, and the set DefRRs is empty.
+//
+// [1] It is possible for multiple phi nodes to be included in the returned
+// sequence:
+// SubA = phi ...
+// SubB = phi ...
+// ... = SuperAB(rdef:SubA), SuperAB"(rdef:SubB)
+// However, these phi nodes are independent from one another in terms of
+// the data-flow.
+
+NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
+ NodeAddr<RefNode*> RefA, bool TopShadows, bool FullChain,
+ const RegisterAggr &DefRRs) {
+ NodeList RDefs; // Return value.
+ SetVector<NodeId> DefQ;
+ SetVector<NodeId> Owners;
+
+ // Dead defs will be treated as if they were live, since they are actually
+ // on the data-flow path. They cannot be ignored because even though they
+ // do not generate meaningful values, they still modify registers.
+
+ // If the reference is undefined, there is nothing to do.
+ if (RefA.Addr->getFlags() & NodeAttrs::Undef)
+ return RDefs;
+
+ // The initial queue should not have reaching defs for shadows. The
+ // whole point of a shadow is that it will have a reaching def that
+ // is not aliased to the reaching defs of the related shadows.
+ NodeId Start = RefA.Id;
+ auto SNA = DFG.addr<RefNode*>(Start);
+ if (NodeId RD = SNA.Addr->getReachingDef())
+ DefQ.insert(RD);
+ if (TopShadows) {
+ for (auto S : DFG.getRelatedRefs(RefA.Addr->getOwner(DFG), RefA))
+ if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef())
+ DefQ.insert(RD);
+ }
+
+ // Collect all the reaching defs, going up until a phi node is encountered,
+ // or there are no more reaching defs. From this set, the actual set of
+ // reaching defs will be selected.
+ // The traversal upwards must go on until a covering def is encountered.
+ // It is possible that a collection of non-covering (individually) defs
+ // will be sufficient, but keep going until a covering one is found.
+ for (unsigned i = 0; i < DefQ.size(); ++i) {
+ auto TA = DFG.addr<DefNode*>(DefQ[i]);
+ if (TA.Addr->getFlags() & NodeAttrs::PhiRef)
+ continue;
+ // Stop at the covering/overwriting def of the initial register reference.
+ RegisterRef RR = TA.Addr->getRegRef(DFG);
+ if (!DFG.IsPreservingDef(TA))
+ if (RegisterAggr::isCoverOf(RR, RefRR, PRI))
+ continue;
+ // Get the next level of reaching defs. This will include multiple
+ // reaching defs for shadows.
+ for (auto S : DFG.getRelatedRefs(TA.Addr->getOwner(DFG), TA))
+ if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef())
+ DefQ.insert(RD);
+ }
+
+ // Remove all non-phi defs that are not aliased to RefRR, and collect
+ // the owners of the remaining defs.
+ SetVector<NodeId> Defs;
+ for (NodeId N : DefQ) {
+ auto TA = DFG.addr<DefNode*>(N);
+ bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef;
+ if (!IsPhi && !PRI.alias(RefRR, TA.Addr->getRegRef(DFG)))
+ continue;
+ Defs.insert(TA.Id);
+ Owners.insert(TA.Addr->getOwner(DFG).Id);
+ }
+
+ // Return the MachineBasicBlock containing a given instruction.
+ auto Block = [this] (NodeAddr<InstrNode*> IA) -> MachineBasicBlock* {
+ if (IA.Addr->getKind() == NodeAttrs::Stmt)
+ return NodeAddr<StmtNode*>(IA).Addr->getCode()->getParent();
+ assert(IA.Addr->getKind() == NodeAttrs::Phi);
+ NodeAddr<PhiNode*> PA = IA;
+ NodeAddr<BlockNode*> BA = PA.Addr->getOwner(DFG);
+ return BA.Addr->getCode();
+ };
+ // Less(A,B) iff instruction A is further down in the dominator tree than B.
+ auto Less = [&Block,this] (NodeId A, NodeId B) -> bool {
+ if (A == B)
+ return false;
+ auto OA = DFG.addr<InstrNode*>(A), OB = DFG.addr<InstrNode*>(B);
+ MachineBasicBlock *BA = Block(OA), *BB = Block(OB);
+ if (BA != BB)
+ return MDT.dominates(BB, BA);
+ // They are in the same block.
+ bool StmtA = OA.Addr->getKind() == NodeAttrs::Stmt;
+ bool StmtB = OB.Addr->getKind() == NodeAttrs::Stmt;
+ if (StmtA) {
+ if (!StmtB) // OB is a phi and phis dominate statements.
+ return true;
+ MachineInstr *CA = NodeAddr<StmtNode*>(OA).Addr->getCode();
+ MachineInstr *CB = NodeAddr<StmtNode*>(OB).Addr->getCode();
+ // The order must be linear, so tie-break such equalities.
+ if (CA == CB)
+ return A < B;
+ return MDT.dominates(CB, CA);
+ } else {
+ // OA is a phi.
+ if (StmtB)
+ return false;
+ // Both are phis. There is no ordering between phis (in terms of
+ // the data-flow), so tie-break this via node id comparison.
+ return A < B;
+ }
+ };
+
+ std::vector<NodeId> Tmp(Owners.begin(), Owners.end());
+ llvm::sort(Tmp, Less);
+
+ // The vector is a list of instructions, so that defs coming from
+ // the same instruction don't need to be artificially ordered.
+ // Then, when computing the initial segment, and iterating over an
+ // instruction, pick the defs that contribute to the covering (i.e. are
+ // not covered by previously added defs). Check the defs individually,
+ // i.e. first check whether each def is covered (without adding any to
+ // the tracking set), and then add all the selected ones.
+
+ // The reason for this is this example:
+ // *d1<A>, *d2<B>, ... Assume A and B are aliased (can happen in phi nodes).
+ // *d3<C> If A \incl BuC, and B \incl AuC, then *d2 would be
+ // covered if we added A first, and A would be covered
+ // if we added B first.
+
+ RegisterAggr RRs(DefRRs);
+
+ auto DefInSet = [&Defs] (NodeAddr<RefNode*> TA) -> bool {
+ return TA.Addr->getKind() == NodeAttrs::Def &&
+ Defs.count(TA.Id);
+ };
+ for (NodeId T : Tmp) {
+ if (!FullChain && RRs.hasCoverOf(RefRR))
+ break;
+ auto TA = DFG.addr<InstrNode*>(T);
+ bool IsPhi = DFG.IsCode<NodeAttrs::Phi>(TA);
+ NodeList Ds;
+ for (NodeAddr<DefNode*> DA : TA.Addr->members_if(DefInSet, DFG)) {
+ RegisterRef QR = DA.Addr->getRegRef(DFG);
+ // Add phi defs even if they are covered by subsequent defs. This is
+ // for cases where the reached use is not covered by any of the defs
+ // encountered so far: the phi def is needed to expose the liveness
+ // of that use to the entry of the block.
+ // Example:
+ // phi d1<R3>(,d2,), ... Phi def d1 is covered by d2.
+ // d2<R3>(d1,,u3), ...
+ // ..., u3<D1>(d2) This use needs to be live on entry.
+ if (FullChain || IsPhi || !RRs.hasCoverOf(QR))
+ Ds.push_back(DA);
+ }
+ RDefs.insert(RDefs.end(), Ds.begin(), Ds.end());
+ for (NodeAddr<DefNode*> DA : Ds) {
+ // When collecting a full chain of definitions, do not consider phi
+ // defs to actually define a register.
+ uint16_t Flags = DA.Addr->getFlags();
+ if (!FullChain || !(Flags & NodeAttrs::PhiRef))
+ if (!(Flags & NodeAttrs::Preserving)) // Don't care about Undef here.
+ RRs.insert(DA.Addr->getRegRef(DFG));
+ }
+ }
+
+ auto DeadP = [](const NodeAddr<DefNode*> DA) -> bool {
+ return DA.Addr->getFlags() & NodeAttrs::Dead;
+ };
+ RDefs.resize(std::distance(RDefs.begin(), llvm::remove_if(RDefs, DeadP)));
+
+ return RDefs;
+}
+
+std::pair<NodeSet,bool>
+Liveness::getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+ NodeSet &Visited, const NodeSet &Defs) {
+ return getAllReachingDefsRecImpl(RefRR, RefA, Visited, Defs, 0, MaxRecNest);
+}
+
+std::pair<NodeSet,bool>
+Liveness::getAllReachingDefsRecImpl(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+ NodeSet &Visited, const NodeSet &Defs, unsigned Nest, unsigned MaxNest) {
+ if (Nest > MaxNest)
+ return { NodeSet(), false };
+ // Collect all defined registers. Do not consider phis to be defining
+ // anything, only collect "real" definitions.
+ RegisterAggr DefRRs(PRI);
+ for (NodeId D : Defs) {
+ const auto DA = DFG.addr<const DefNode*>(D);
+ if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
+ DefRRs.insert(DA.Addr->getRegRef(DFG));
+ }
+
+ NodeList RDs = getAllReachingDefs(RefRR, RefA, false, true, DefRRs);
+ if (RDs.empty())
+ return { Defs, true };
+
+ // Make a copy of the preexisting definitions and add the newly found ones.
+ NodeSet TmpDefs = Defs;
+ for (NodeAddr<NodeBase*> R : RDs)
+ TmpDefs.insert(R.Id);
+
+ NodeSet Result = Defs;
+
+ for (NodeAddr<DefNode*> DA : RDs) {
+ Result.insert(DA.Id);
+ if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
+ continue;
+ NodeAddr<PhiNode*> PA = DA.Addr->getOwner(DFG);
+ if (Visited.count(PA.Id))
+ continue;
+ Visited.insert(PA.Id);
+ // Go over all phi uses and get the reaching defs for each use.
+ for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+ const auto &T = getAllReachingDefsRecImpl(RefRR, U, Visited, TmpDefs,
+ Nest+1, MaxNest);
+ if (!T.second)
+ return { T.first, false };
+ Result.insert(T.first.begin(), T.first.end());
+ }
+ }
+
+ return { Result, true };
+}
+
+/// Find the nearest ref node aliased to RefRR, going upwards in the data
+/// flow, starting from the instruction immediately preceding Inst.
+NodeAddr<RefNode*> Liveness::getNearestAliasedRef(RegisterRef RefRR,
+ NodeAddr<InstrNode*> IA) {
+ NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
+ NodeList Ins = BA.Addr->members(DFG);
+ NodeId FindId = IA.Id;
+ auto E = Ins.rend();
+ auto B = std::find_if(Ins.rbegin(), E,
+ [FindId] (const NodeAddr<InstrNode*> T) {
+ return T.Id == FindId;
+ });
+ // Do not scan IA (which is what B would point to).
+ if (B != E)
+ ++B;
+
+ do {
+ // Process the range of instructions from B to E.
+ for (NodeAddr<InstrNode*> I : make_range(B, E)) {
+ NodeList Refs = I.Addr->members(DFG);
+ NodeAddr<RefNode*> Clob, Use;
+ // Scan all the refs in I aliased to RefRR, and return the one that
+ // is the closest to the output of I, i.e. def > clobber > use.
+ for (NodeAddr<RefNode*> R : Refs) {
+ if (!PRI.alias(R.Addr->getRegRef(DFG), RefRR))
+ continue;
+ if (DFG.IsDef(R)) {
+ // If it's a non-clobbering def, just return it.
+ if (!(R.Addr->getFlags() & NodeAttrs::Clobbering))
+ return R;
+ Clob = R;
+ } else {
+ Use = R;
+ }
+ }
+ if (Clob.Id != 0)
+ return Clob;
+ if (Use.Id != 0)
+ return Use;
+ }
+
+ // Go up to the immediate dominator, if any.
+ MachineBasicBlock *BB = BA.Addr->getCode();
+ BA = NodeAddr<BlockNode*>();
+ if (MachineDomTreeNode *N = MDT.getNode(BB)) {
+ if ((N = N->getIDom()))
+ BA = DFG.findBlock(N->getBlock());
+ }
+ if (!BA.Id)
+ break;
+
+ Ins = BA.Addr->members(DFG);
+ B = Ins.rbegin();
+ E = Ins.rend();
+ } while (true);
+
+ return NodeAddr<RefNode*>();
+}
+
+NodeSet Liveness::getAllReachedUses(RegisterRef RefRR,
+ NodeAddr<DefNode*> DefA, const RegisterAggr &DefRRs) {
+ NodeSet Uses;
+
+ // If the original register is already covered by all the intervening
+ // defs, no more uses can be reached.
+ if (DefRRs.hasCoverOf(RefRR))
+ return Uses;
+
+ // Add all directly reached uses.
+ // If the def is dead, it does not provide a value for any use.
+ bool IsDead = DefA.Addr->getFlags() & NodeAttrs::Dead;
+ NodeId U = !IsDead ? DefA.Addr->getReachedUse() : 0;
+ while (U != 0) {
+ auto UA = DFG.addr<UseNode*>(U);
+ if (!(UA.Addr->getFlags() & NodeAttrs::Undef)) {
+ RegisterRef UR = UA.Addr->getRegRef(DFG);
+ if (PRI.alias(RefRR, UR) && !DefRRs.hasCoverOf(UR))
+ Uses.insert(U);
+ }
+ U = UA.Addr->getSibling();
+ }
+
+ // Traverse all reached defs. This time dead defs cannot be ignored.
+ for (NodeId D = DefA.Addr->getReachedDef(), NextD; D != 0; D = NextD) {
+ auto DA = DFG.addr<DefNode*>(D);
+ NextD = DA.Addr->getSibling();
+ RegisterRef DR = DA.Addr->getRegRef(DFG);
+ // If this def is already covered, it cannot reach anything new.
+ // Similarly, skip it if it is not aliased to the interesting register.
+ if (DefRRs.hasCoverOf(DR) || !PRI.alias(RefRR, DR))
+ continue;
+ NodeSet T;
+ if (DFG.IsPreservingDef(DA)) {
+ // If it is a preserving def, do not update the set of intervening defs.
+ T = getAllReachedUses(RefRR, DA, DefRRs);
+ } else {
+ RegisterAggr NewDefRRs = DefRRs;
+ NewDefRRs.insert(DR);
+ T = getAllReachedUses(RefRR, DA, NewDefRRs);
+ }
+ Uses.insert(T.begin(), T.end());
+ }
+ return Uses;
+}
+
+void Liveness::computePhiInfo() {
+ RealUseMap.clear();
+
+ NodeList Phis;
+ NodeAddr<FuncNode*> FA = DFG.getFunc();
+ NodeList Blocks = FA.Addr->members(DFG);
+ for (NodeAddr<BlockNode*> BA : Blocks) {
+ auto Ps = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG);
+ Phis.insert(Phis.end(), Ps.begin(), Ps.end());
+ }
+
+ // phi use -> (map: reaching phi -> set of registers defined in between)
+ std::map<NodeId,std::map<NodeId,RegisterAggr>> PhiUp;
+ std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation.
+ std::map<NodeId,RegisterAggr> PhiDRs; // Phi -> registers defined by it.
+
+ // Go over all phis.
+ for (NodeAddr<PhiNode*> PhiA : Phis) {
+ // Go over all defs and collect the reached uses that are non-phi uses
+ // (i.e. the "real uses").
+ RefMap &RealUses = RealUseMap[PhiA.Id];
+ NodeList PhiRefs = PhiA.Addr->members(DFG);
+
+ // Have a work queue of defs whose reached uses need to be found.
+ // For each def, add to the queue all reached (non-phi) defs.
+ SetVector<NodeId> DefQ;
+ NodeSet PhiDefs;
+ RegisterAggr DRs(PRI);
+ for (NodeAddr<RefNode*> R : PhiRefs) {
+ if (!DFG.IsRef<NodeAttrs::Def>(R))
+ continue;
+ DRs.insert(R.Addr->getRegRef(DFG));
+ DefQ.insert(R.Id);
+ PhiDefs.insert(R.Id);
+ }
+ PhiDRs.insert(std::make_pair(PhiA.Id, DRs));
+
+ // Collect the super-set of all possible reached uses. This set will
+ // contain all uses reached from this phi, either directly from the
+ // phi defs, or (recursively) via non-phi defs reached by the phi defs.
+ // This set of uses will later be trimmed to only contain the uses that
+ // are actually reached by the phi defs.
+ for (unsigned i = 0; i < DefQ.size(); ++i) {
+ NodeAddr<DefNode*> DA = DFG.addr<DefNode*>(DefQ[i]);
+ // Visit all reached uses. Phi defs should not really have the "dead"
+ // flag set, but check it anyway for consistency.
+ bool IsDead = DA.Addr->getFlags() & NodeAttrs::Dead;
+ NodeId UN = !IsDead ? DA.Addr->getReachedUse() : 0;
+ while (UN != 0) {
+ NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN);
+ uint16_t F = A.Addr->getFlags();
+ if ((F & (NodeAttrs::Undef | NodeAttrs::PhiRef)) == 0) {
+ RegisterRef R = PRI.normalize(A.Addr->getRegRef(DFG));
+ RealUses[R.Reg].insert({A.Id,R.Mask});
+ }
+ UN = A.Addr->getSibling();
+ }
+ // Visit all reached defs, and add them to the queue. These defs may
+ // override some of the uses collected here, but that will be handled
+ // later.
+ NodeId DN = DA.Addr->getReachedDef();
+ while (DN != 0) {
+ NodeAddr<DefNode*> A = DFG.addr<DefNode*>(DN);
+ for (auto T : DFG.getRelatedRefs(A.Addr->getOwner(DFG), A)) {
+ uint16_t Flags = NodeAddr<DefNode*>(T).Addr->getFlags();
+ // Must traverse the reached-def chain. Consider:
+ // def(D0) -> def(R0) -> def(R0) -> use(D0)
+ // The reachable use of D0 passes through a def of R0.
+ if (!(Flags & NodeAttrs::PhiRef))
+ DefQ.insert(T.Id);
+ }
+ DN = A.Addr->getSibling();
+ }
+ }
+ // Filter out those uses that appear to be reachable, but really
+ // are not. For example:
+ //
+ // R1:0 = d1
+ // = R1:0 u2 Reached by d1.
+ // R0 = d3
+ // = R1:0 u4 Still reached by d1: indirectly through
+ // the def d3.
+ // R1 = d5
+ // = R1:0 u6 Not reached by d1 (covered collectively
+ // by d3 and d5), but following reached
+ // defs and uses from d1 will lead here.
+ for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE; ) {
+ // For each reached register UI->first, there is a set UI->second, of
+ // uses of it. For each such use, check if it is reached by this phi,
+ // i.e. check if the set of its reaching uses intersects the set of
+ // this phi's defs.
+ NodeRefSet Uses = UI->second;
+ UI->second.clear();
+ for (std::pair<NodeId,LaneBitmask> I : Uses) {
+ auto UA = DFG.addr<UseNode*>(I.first);
+ // Undef flag is checked above.
+ assert((UA.Addr->getFlags() & NodeAttrs::Undef) == 0);
+ RegisterRef R(UI->first, I.second);
+ // Calculate the exposed part of the reached use.
+ RegisterAggr Covered(PRI);
+ for (NodeAddr<DefNode*> DA : getAllReachingDefs(R, UA)) {
+ if (PhiDefs.count(DA.Id))
+ break;
+ Covered.insert(DA.Addr->getRegRef(DFG));
+ }
+ if (RegisterRef RC = Covered.clearIn(R)) {
+ // We are updating the map for register UI->first, so we need
+ // to map RC to be expressed in terms of that register.
+ RegisterRef S = PRI.mapTo(RC, UI->first);
+ UI->second.insert({I.first, S.Mask});
+ }
+ }
+ UI = UI->second.empty() ? RealUses.erase(UI) : std::next(UI);
+ }
+
+ // If this phi reaches some "real" uses, add it to the queue for upward
+ // propagation.
+ if (!RealUses.empty())
+ PhiUQ.push_back(PhiA.Id);
+
+ // Go over all phi uses and check if the reaching def is another phi.
+ // Collect the phis that are among the reaching defs of these uses.
+ // While traversing the list of reaching defs for each phi use, accumulate
+ // the set of registers defined between this phi (PhiA) and the owner phi
+ // of the reaching def.
+ NodeSet SeenUses;
+
+ for (auto I : PhiRefs) {
+ if (!DFG.IsRef<NodeAttrs::Use>(I) || SeenUses.count(I.Id))
+ continue;
+ NodeAddr<PhiUseNode*> PUA = I;
+ if (PUA.Addr->getReachingDef() == 0)
+ continue;
+
+ RegisterRef UR = PUA.Addr->getRegRef(DFG);
+ NodeList Ds = getAllReachingDefs(UR, PUA, true, false, NoRegs);
+ RegisterAggr DefRRs(PRI);
+
+ for (NodeAddr<DefNode*> D : Ds) {
+ if (D.Addr->getFlags() & NodeAttrs::PhiRef) {
+ NodeId RP = D.Addr->getOwner(DFG).Id;
+ std::map<NodeId,RegisterAggr> &M = PhiUp[PUA.Id];
+ auto F = M.find(RP);
+ if (F == M.end())
+ M.insert(std::make_pair(RP, DefRRs));
+ else
+ F->second.insert(DefRRs);
+ }
+ DefRRs.insert(D.Addr->getRegRef(DFG));
+ }
+
+ for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PhiA, PUA))
+ SeenUses.insert(T.Id);
+ }
+ }
+
+ if (Trace) {
+ dbgs() << "Phi-up-to-phi map with intervening defs:\n";
+ for (auto I : PhiUp) {
+ dbgs() << "phi " << Print<NodeId>(I.first, DFG) << " -> {";
+ for (auto R : I.second)
+ dbgs() << ' ' << Print<NodeId>(R.first, DFG)
+ << Print<RegisterAggr>(R.second, DFG);
+ dbgs() << " }\n";
+ }
+ }
+
+ // Propagate the reached registers up in the phi chain.
+ //
+ // The following type of situation needs careful handling:
+ //
+ // phi d1<R1:0> (1)
+ // |
+ // ... d2<R1>
+ // |
+ // phi u3<R1:0> (2)
+ // |
+ // ... u4<R1>
+ //
+ // The phi node (2) defines a register pair R1:0, and reaches a "real"
+ // use u4 of just R1. The same phi node is also known to reach (upwards)
+ // the phi node (1). However, the use u4 is not reached by phi (1),
+ // because of the intervening definition d2 of R1. The data flow between
+ // phis (1) and (2) is restricted to R1:0 minus R1, i.e. R0.
+ //
+ // When propagating uses up the phi chains, get all the reaching defs
+ // for a given phi use, and traverse the list until the propagated ref
+ // is covered, or until reaching the final phi. Only assume that the
+ // reference reaches the phi in the latter case.
+
+ for (unsigned i = 0; i < PhiUQ.size(); ++i) {
+ auto PA = DFG.addr<PhiNode*>(PhiUQ[i]);
+ NodeList PUs = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG);
+ RefMap &RUM = RealUseMap[PA.Id];
+
+ for (NodeAddr<UseNode*> UA : PUs) {
+ std::map<NodeId,RegisterAggr> &PUM = PhiUp[UA.Id];
+ RegisterRef UR = PRI.normalize(UA.Addr->getRegRef(DFG));
+ for (const std::pair<const NodeId, RegisterAggr> &P : PUM) {
+ bool Changed = false;
+ const RegisterAggr &MidDefs = P.second;
+
+ // Collect the set PropUp of uses that are reached by the current
+ // phi PA, and are not covered by any intervening def between the
+ // currently visited use UA and the upward phi P.
+
+ if (MidDefs.hasCoverOf(UR))
+ continue;
+
+ // General algorithm:
+ // for each (R,U) : U is use node of R, U is reached by PA
+ // if MidDefs does not cover (R,U)
+ // then add (R-MidDefs,U) to RealUseMap[P]
+ //
+ for (const std::pair<const RegisterId, NodeRefSet> &T : RUM) {
+ RegisterRef R(T.first);
+ // The current phi (PA) could be a phi for a regmask. It could
+ // reach a whole variety of uses that are not related to the
+ // specific upward phi (P.first).
+ const RegisterAggr &DRs = PhiDRs.at(P.first);
+ if (!DRs.hasAliasOf(R))
+ continue;
+ R = PRI.mapTo(DRs.intersectWith(R), T.first);
+ for (std::pair<NodeId,LaneBitmask> V : T.second) {
+ LaneBitmask M = R.Mask & V.second;
+ if (M.none())
+ continue;
+ if (RegisterRef SS = MidDefs.clearIn(RegisterRef(R.Reg, M))) {
+ NodeRefSet &RS = RealUseMap[P.first][SS.Reg];
+ Changed |= RS.insert({V.first,SS.Mask}).second;
+ }
+ }
+ }
+
+ if (Changed)
+ PhiUQ.push_back(P.first);
+ }
+ }
+ }
+
+ if (Trace) {
+ dbgs() << "Real use map:\n";
+ for (auto I : RealUseMap) {
+ dbgs() << "phi " << Print<NodeId>(I.first, DFG);
+ NodeAddr<PhiNode*> PA = DFG.addr<PhiNode*>(I.first);
+ NodeList Ds = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Def>, DFG);
+ if (!Ds.empty()) {
+ RegisterRef RR = NodeAddr<DefNode*>(Ds[0]).Addr->getRegRef(DFG);
+ dbgs() << '<' << Print<RegisterRef>(RR, DFG) << '>';
+ } else {
+ dbgs() << "<noreg>";
+ }
+ dbgs() << " -> " << Print<RefMap>(I.second, DFG) << '\n';
+ }
+ }
+}
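+
+// The upward propagation above is a fixed-point loop: a phi is re-queued
+// whenever its real-use set grows, and the loop runs until no step reports
+// a change. The generic shape, over int ids (illustrative sketch only,
+// excluded from the build); termination relies on Step being monotone,
+// i.e. the per-node state only ever grows:
+#if 0
+#include <cstddef>
+#include <functional>
+#include <vector>
+
+// Step(N) updates some global state for node N and returns the nodes whose
+// state changed as a result and therefore must be revisited.
+void runToFixedPoint(std::vector<int> Queue,
+                     const std::function<std::vector<int>(int)> &Step) {
+  for (std::size_t i = 0; i < Queue.size(); ++i) // queue grows as we go
+    for (int Changed : Step(Queue[i]))
+      Queue.push_back(Changed);
+}
+#endif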
+
+void Liveness::computeLiveIns() {
+ // Populate the node-to-block map. This speeds up the calculations
+ // significantly.
+ NBMap.clear();
+ for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) {
+ MachineBasicBlock *BB = BA.Addr->getCode();
+ for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
+ for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
+ NBMap.insert(std::make_pair(RA.Id, BB));
+ NBMap.insert(std::make_pair(IA.Id, BB));
+ }
+ }
+
+ MachineFunction &MF = DFG.getMF();
+
+ // Compute IDF first, then the inverse.
+ decltype(IIDF) IDF;
+ for (MachineBasicBlock &B : MF) {
+ auto F1 = MDF.find(&B);
+ if (F1 == MDF.end())
+ continue;
+ SetVector<MachineBasicBlock*> IDFB(F1->second.begin(), F1->second.end());
+ for (unsigned i = 0; i < IDFB.size(); ++i) {
+ auto F2 = MDF.find(IDFB[i]);
+ if (F2 != MDF.end())
+ IDFB.insert(F2->second.begin(), F2->second.end());
+ }
+ // Add B to the IDF(B). This will put B in the IIDF(B).
+ IDFB.insert(&B);
+ IDF[&B].insert(IDFB.begin(), IDFB.end());
+ }
+
+ for (auto I : IDF)
+ for (auto S : I.second)
+ IIDF[S].insert(I.first);
+
+ computePhiInfo();
+
+ NodeAddr<FuncNode*> FA = DFG.getFunc();
+ NodeList Blocks = FA.Addr->members(DFG);
+
+ // Build the phi live-on-entry map.
+ for (NodeAddr<BlockNode*> BA : Blocks) {
+ MachineBasicBlock *MB = BA.Addr->getCode();
+ RefMap &LON = PhiLON[MB];
+ for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG))
+ for (const RefMap::value_type &S : RealUseMap[P.Id])
+ LON[S.first].insert(S.second.begin(), S.second.end());
+ }
+
+ if (Trace) {
+ dbgs() << "Phi live-on-entry map:\n";
+ for (auto &I : PhiLON)
+ dbgs() << "block #" << I.first->getNumber() << " -> "
+ << Print<RefMap>(I.second, DFG) << '\n';
+ }
+
+ // Build the phi live-on-exit map. Each phi node has some set of reached
+ // "real" uses. Propagate this set backwards into the block predecessors
+ // through the reaching defs of the corresponding phi uses.
+ for (NodeAddr<BlockNode*> BA : Blocks) {
+ NodeList Phis = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG);
+ for (NodeAddr<PhiNode*> PA : Phis) {
+ RefMap &RUs = RealUseMap[PA.Id];
+ if (RUs.empty())
+ continue;
+
+ NodeSet SeenUses;
+ for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+ if (!SeenUses.insert(U.Id).second)
+ continue;
+ NodeAddr<PhiUseNode*> PUA = U;
+ if (PUA.Addr->getReachingDef() == 0)
+ continue;
+
+ // Each phi has some set (possibly empty) of reached "real" uses,
+ // that is, uses that are part of the compiled program. Such a use
+ // may be located in some farther block, but following a chain of
+ // reaching defs will eventually lead to this phi.
+ // Any chain of reaching defs may fork at a phi node, but there
+ // will be a path upwards that will lead to this phi. Now, this
+ // chain will need to fork at this phi, since some of the reached
+ // uses may have definitions joining in from multiple predecessors.
+ // For each reached "real" use, identify the set of reaching defs
+ // coming from each predecessor P, and add them to PhiLOX[P].
+ //
+ auto PrA = DFG.addr<BlockNode*>(PUA.Addr->getPredecessor());
+ RefMap &LOX = PhiLOX[PrA.Addr->getCode()];
+
+ for (const std::pair<const RegisterId, NodeRefSet> &RS : RUs) {
+ // We need to visit each individual use.
+ for (std::pair<NodeId,LaneBitmask> P : RS.second) {
+ // Create a register ref corresponding to the use, and find
+ // all reaching defs starting from the phi use, and treating
+ // all related shadows as a single use cluster.
+ RegisterRef S(RS.first, P.second);
+ NodeList Ds = getAllReachingDefs(S, PUA, true, false, NoRegs);
+ for (NodeAddr<DefNode*> D : Ds) {
+ // Calculate the mask corresponding to the visited def.
+ RegisterAggr TA(PRI);
+ TA.insert(D.Addr->getRegRef(DFG)).intersect(S);
+ LaneBitmask TM = TA.makeRegRef().Mask;
+ LOX[S.Reg].insert({D.Id, TM});
+ }
+ }
+ }
+
+ for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PA, PUA))
+ SeenUses.insert(T.Id);
+ } // for U : phi uses
+ } // for P : Phis
+ } // for B : Blocks
+
+ if (Trace) {
+ dbgs() << "Phi live-on-exit map:\n";
+ for (auto &I : PhiLOX)
+ dbgs() << "block #" << I.first->getNumber() << " -> "
+ << Print<RefMap>(I.second, DFG) << '\n';
+ }
+
+ RefMap LiveIn;
+ traverse(&MF.front(), LiveIn);
+
+ // Add function live-ins to the live-in set of the function entry block.
+ LiveMap[&MF.front()].insert(DFG.getLiveIns());
+
+ if (Trace) {
+ // Dump the liveness map
+ for (MachineBasicBlock &B : MF) {
+ std::vector<RegisterRef> LV;
+ for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I)
+ LV.push_back(RegisterRef(I->PhysReg, I->LaneMask));
+ llvm::sort(LV);
+ dbgs() << printMBBReference(B) << "\t rec = {";
+ for (auto I : LV)
+ dbgs() << ' ' << Print<RegisterRef>(I, DFG);
+ dbgs() << " }\n";
+ //dbgs() << "\tcomp = " << Print<RegisterAggr>(LiveMap[&B], DFG) << '\n';
+
+ LV.clear();
+ const RegisterAggr &LG = LiveMap[&B];
+ for (auto I = LG.rr_begin(), E = LG.rr_end(); I != E; ++I)
+ LV.push_back(*I);
+ llvm::sort(LV);
+ dbgs() << "\tcomp = {";
+ for (auto I : LV)
+ dbgs() << ' ' << Print<RegisterRef>(I, DFG);
+ dbgs() << " }\n";
+
+ }
+ }
+}
+
+void Liveness::resetLiveIns() {
+ for (auto &B : DFG.getMF()) {
+ // Remove all live-ins.
+ std::vector<unsigned> T;
+ for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I)
+ T.push_back(I->PhysReg);
+ for (auto I : T)
+ B.removeLiveIn(I);
+ // Add the newly computed live-ins.
+ const RegisterAggr &LiveIns = LiveMap[&B];
+ for (auto I = LiveIns.rr_begin(), E = LiveIns.rr_end(); I != E; ++I) {
+ RegisterRef R = *I;
+ B.addLiveIn({MCPhysReg(R.Reg), R.Mask});
+ }
+ }
+}
+
+void Liveness::resetKills() {
+ for (auto &B : DFG.getMF())
+ resetKills(&B);
+}
+
+void Liveness::resetKills(MachineBasicBlock *B) {
+ auto CopyLiveIns = [this] (MachineBasicBlock *B, BitVector &LV) -> void {
+ for (auto I : B->liveins()) {
+ MCSubRegIndexIterator S(I.PhysReg, &TRI);
+ if (!S.isValid()) {
+ LV.set(I.PhysReg);
+ continue;
+ }
+ do {
+ LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
+ if ((M & I.LaneMask).any())
+ LV.set(S.getSubReg());
+ ++S;
+ } while (S.isValid());
+ }
+ };
+
+ BitVector LiveIn(TRI.getNumRegs()), Live(TRI.getNumRegs());
+ CopyLiveIns(B, LiveIn);
+ for (auto SI : B->successors())
+ CopyLiveIns(SI, Live);
+
+ for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) {
+ MachineInstr *MI = &*I;
+ if (MI->isDebugInstr())
+ continue;
+
+ MI->clearKillInfo();
+ for (auto &Op : MI->operands()) {
+ // An implicit def of a super-register may not necessarily start a
+ // live range of it, since an implicit use could be used to keep parts
+ // of it live. Instead of analyzing the implicit operands, ignore
+ // implicit defs.
+ if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+ continue;
+ Register R = Op.getReg();
+ if (!Register::isPhysicalRegister(R))
+ continue;
+ for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
+ Live.reset(*SR);
+ }
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg() || !Op.isUse() || Op.isUndef())
+ continue;
+ Register R = Op.getReg();
+ if (!Register::isPhysicalRegister(R))
+ continue;
+ bool IsLive = false;
+ for (MCRegAliasIterator AR(R, &TRI, true); AR.isValid(); ++AR) {
+ if (!Live[*AR])
+ continue;
+ IsLive = true;
+ break;
+ }
+ if (!IsLive)
+ Op.setIsKill(true);
+ for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
+ Live.set(*SR);
+ }
+ }
+}
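+
+// Kill-flag recomputation is a standard backward scan: seed the live set
+// with the union of the successors' live-ins, then per instruction clear
+// the defs from the live set and mark a use as a kill iff its register is
+// not live below it. The same scan over a toy instruction form, ignoring
+// sub-register aliasing (illustrative sketch only, excluded from the
+// build):
+#if 0
+#include <set>
+#include <utility>
+#include <vector>
+
+struct ToyInstr {
+  std::vector<int> Defs, Uses;
+  std::set<int> Kills; // recomputed output
+};
+
+void resetKillsModel(std::vector<ToyInstr> &Block, std::set<int> LiveOut) {
+  std::set<int> Live = std::move(LiveOut);
+  for (auto I = Block.rbegin(), E = Block.rend(); I != E; ++I) {
+    I->Kills.clear();
+    for (int R : I->Defs)
+      Live.erase(R); // the def overwrites R, so R from above is dead here
+    for (int R : I->Uses) {
+      if (!Live.count(R))
+        I->Kills.insert(R); // nothing below reads R: this use kills it
+      Live.insert(R);       // the use keeps R live above this point
+    }
+  }
+}
+#endif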
+
+// Helper function to obtain the basic block containing the reaching def
+// of the given use.
+MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const {
+ auto F = NBMap.find(RN);
+ if (F != NBMap.end())
+ return F->second;
+ llvm_unreachable("Node id not in map");
+}
+
+void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
+ // The LiveIn map, for each (physical) register, contains the set of live
+ // reaching defs of that register that are live on entry to the associated
+ // block.
+
+ // The summary of the traversal algorithm:
+ //
+ // R is live-in in B, if there exists a U(R), such that rdef(R) dom B
+ // and (U \in IDF(B) or B dom U).
+ //
+ // for (C : children) {
+ // LU = {}
+ // traverse(C, LU)
+ // LiveUses += LU
+ // }
+ //
+ // LiveUses -= Defs(B);
+ // LiveUses += UpwardExposedUses(B);
+ // for (C : IIDF[B])
+ // for (U : LiveUses)
+ // if (Rdef(U) dom C)
+ // C.addLiveIn(U)
+ //
+
+ // Go up the dominator tree (depth-first).
+ MachineDomTreeNode *N = MDT.getNode(B);
+ for (auto I : *N) {
+ RefMap L;
+ MachineBasicBlock *SB = I->getBlock();
+ traverse(SB, L);
+
+ for (auto S : L)
+ LiveIn[S.first].insert(S.second.begin(), S.second.end());
+ }
+
+ if (Trace) {
+ dbgs() << "\n-- " << printMBBReference(*B) << ": " << __func__
+ << " after recursion into: {";
+ for (auto I : *N)
+ dbgs() << ' ' << I->getBlock()->getNumber();
+ dbgs() << " }\n";
+ dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+ }
+
+ // Add reaching defs of phi uses that are live on exit from this block.
+ RefMap &PUs = PhiLOX[B];
+ for (auto &S : PUs)
+ LiveIn[S.first].insert(S.second.begin(), S.second.end());
+
+ if (Trace) {
+ dbgs() << "after LOX\n";
+ dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+ }
+
+ // The LiveIn map at this point has all defs that are live-on-exit from B,
+ // as if they were live-on-entry to B. First, we need to filter out all
+ // defs that are present in this block. Then we will add reaching defs of
+ // all upward-exposed uses.
+
+ // To filter out the defs, first make a copy of LiveIn, and then re-populate
+ // LiveIn with the defs that should remain.
+ RefMap LiveInCopy = LiveIn;
+ LiveIn.clear();
+
+ for (const std::pair<const RegisterId, NodeRefSet> &LE : LiveInCopy) {
+ RegisterRef LRef(LE.first);
+ NodeRefSet &NewDefs = LiveIn[LRef.Reg]; // To be filled.
+ const NodeRefSet &OldDefs = LE.second;
+ for (NodeRef OR : OldDefs) {
+ // R is a def node that was live-on-exit
+ auto DA = DFG.addr<DefNode*>(OR.first);
+ NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG);
+ NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
+ if (B != BA.Addr->getCode()) {
+ // Defs from a different block need to be preserved. Defs from this
+ // block will need to be processed further, except for phi defs, the
+ // liveness of which is handled through the PhiLON/PhiLOX maps.
+ NewDefs.insert(OR);
+ continue;
+ }
+
+ // Defs from this block need to stop the liveness from being
+ // propagated upwards. This only applies to non-preserving defs,
+ // and to the parts of the register actually covered by those defs.
+ // (Note that phi defs should always be preserving.)
+ RegisterAggr RRs(PRI);
+ LRef.Mask = OR.second;
+
+ if (!DFG.IsPreservingDef(DA)) {
+ assert(!(IA.Addr->getFlags() & NodeAttrs::Phi));
+ // DA is a non-phi def that is live-on-exit from this block, and
+ // that is also located in this block. LRef is a register ref
+ // whose use this def reaches. If DA covers LRef, then no part
+ // of LRef is exposed upwards.
+ if (RRs.insert(DA.Addr->getRegRef(DFG)).hasCoverOf(LRef))
+ continue;
+ }
+
+ // DA itself was not sufficient to cover LRef. In general, it is
+ // the last in a chain of aliased defs before the exit from this block.
+ // There could be other defs in this block that are a part of that
+ // chain. Check that now: accumulate the registers from these defs,
+ // and if they all together cover LRef, it is not live-on-entry.
+ for (NodeAddr<DefNode*> TA : getAllReachingDefs(DA)) {
+ // DefNode -> InstrNode -> BlockNode.
+ NodeAddr<InstrNode*> ITA = TA.Addr->getOwner(DFG);
+ NodeAddr<BlockNode*> BTA = ITA.Addr->getOwner(DFG);
+ // Reaching defs are ordered in the upward direction.
+ if (BTA.Addr->getCode() != B) {
+ // We have reached past the beginning of B, and the accumulated
+ // registers are not covering LRef. The first def from the
+ // upward chain will be live.
+ // Subtract all accumulated defs (RRs) from LRef.
+ RegisterRef T = RRs.clearIn(LRef);
+ assert(T);
+ NewDefs.insert({TA.Id,T.Mask});
+ break;
+ }
+
+ // TA is in B. Only add this def to the accumulated cover if it is
+ // not preserving.
+ if (!(TA.Addr->getFlags() & NodeAttrs::Preserving))
+ RRs.insert(TA.Addr->getRegRef(DFG));
+ // If this is enough to cover LRef, then stop.
+ if (RRs.hasCoverOf(LRef))
+ break;
+ }
+ }
+ }
+
+ emptify(LiveIn);
+
+ if (Trace) {
+ dbgs() << "after defs in block\n";
+ dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+ }
+
+ // Scan the block for upward-exposed uses and add them to the tracking set.
+ for (auto I : DFG.getFunc().Addr->findBlock(B, DFG).Addr->members(DFG)) {
+ NodeAddr<InstrNode*> IA = I;
+ if (IA.Addr->getKind() != NodeAttrs::Stmt)
+ continue;
+ for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
+ if (UA.Addr->getFlags() & NodeAttrs::Undef)
+ continue;
+ RegisterRef RR = PRI.normalize(UA.Addr->getRegRef(DFG));
+ for (NodeAddr<DefNode*> D : getAllReachingDefs(UA))
+ if (getBlockWithRef(D.Id) != B)
+ LiveIn[RR.Reg].insert({D.Id,RR.Mask});
+ }
+ }
+
+ if (Trace) {
+ dbgs() << "after uses in block\n";
+ dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+ }
+
+ // Phi uses should not be propagated up the dominator tree, since they
+ // are not dominated by their corresponding reaching defs.
+ RegisterAggr &Local = LiveMap[B];
+ RefMap &LON = PhiLON[B];
+ for (auto &R : LON) {
+ LaneBitmask M;
+ for (auto P : R.second)
+ M |= P.second;
+ Local.insert(RegisterRef(R.first,M));
+ }
+
+ if (Trace) {
+ dbgs() << "after phi uses in block\n";
+ dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print<RegisterAggr>(Local, DFG) << '\n';
+ }
+
+ for (auto C : IIDF[B]) {
+ RegisterAggr &LiveC = LiveMap[C];
+ for (const std::pair<const RegisterId, NodeRefSet> &S : LiveIn)
+ for (auto R : S.second)
+ if (MDT.properlyDominates(getBlockWithRef(R.first), C))
+ LiveC.insert(RegisterRef(S.first, R.second));
+ }
+}
+
+void Liveness::emptify(RefMap &M) {
+ for (auto I = M.begin(), E = M.end(); I != E; )
+ I = I->second.empty() ? M.erase(I) : std::next(I);
+}
Property changes on: head/contrib/llvm-project/llvm/lib/CodeGen/RDFLiveness.cpp
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/CodeGen/RDFRegisters.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/CodeGen/RDFRegisters.cpp (nonexistent)
+++ head/contrib/llvm-project/llvm/lib/CodeGen/RDFRegisters.cpp (revision 362609)
@@ -0,0 +1,380 @@
+//===- RDFRegisters.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/RDFRegisters.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <set>
+#include <utility>
+
+using namespace llvm;
+using namespace rdf;
+
+PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri,
+ const MachineFunction &mf)
+ : TRI(tri) {
+ RegInfos.resize(TRI.getNumRegs());
+
+ BitVector BadRC(TRI.getNumRegs());
+ for (const TargetRegisterClass *RC : TRI.regclasses()) {
+ for (MCPhysReg R : *RC) {
+ RegInfo &RI = RegInfos[R];
+ if (RI.RegClass != nullptr && !BadRC[R]) {
+ if (RC->LaneMask != RI.RegClass->LaneMask) {
+ BadRC.set(R);
+ RI.RegClass = nullptr;
+ }
+ } else
+ RI.RegClass = RC;
+ }
+ }
+
+ UnitInfos.resize(TRI.getNumRegUnits());
+
+ for (uint32_t U = 0, NU = TRI.getNumRegUnits(); U != NU; ++U) {
+ if (UnitInfos[U].Reg != 0)
+ continue;
+ MCRegUnitRootIterator R(U, &TRI);
+ assert(R.isValid());
+ RegisterId F = *R;
+ ++R;
+ if (R.isValid()) {
+ UnitInfos[U].Mask = LaneBitmask::getAll();
+ UnitInfos[U].Reg = F;
+ } else {
+ for (MCRegUnitMaskIterator I(F, &TRI); I.isValid(); ++I) {
+ std::pair<uint32_t,LaneBitmask> P = *I;
+ UnitInfo &UI = UnitInfos[P.first];
+ UI.Reg = F;
+ if (P.second.any()) {
+ UI.Mask = P.second;
+ } else {
+ if (const TargetRegisterClass *RC = RegInfos[F].RegClass)
+ UI.Mask = RC->LaneMask;
+ else
+ UI.Mask = LaneBitmask::getAll();
+ }
+ }
+ }
+ }
+
+ for (const uint32_t *RM : TRI.getRegMasks())
+ RegMasks.insert(RM);
+ for (const MachineBasicBlock &B : mf)
+ for (const MachineInstr &In : B)
+ for (const MachineOperand &Op : In.operands())
+ if (Op.isRegMask())
+ RegMasks.insert(Op.getRegMask());
+
+ MaskInfos.resize(RegMasks.size()+1);
+ for (uint32_t M = 1, NM = RegMasks.size(); M <= NM; ++M) {
+ BitVector PU(TRI.getNumRegUnits());
+ const uint32_t *MB = RegMasks.get(M);
+ for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) {
+ if (!(MB[i/32] & (1u << (i%32))))
+ continue;
+ for (MCRegUnitIterator U(i, &TRI); U.isValid(); ++U)
+ PU.set(*U);
+ }
+ MaskInfos[M].Units = PU.flip();
+ }
+}
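+
+// Illustrative sketch (hypothetical helper, not part of this change) of the
+// regmask bit convention used above: bit R of a call's register mask is set
+// when register R is preserved across the call. MaskInfos flips the
+// collected units, so it ends up recording the clobbered units.
+static bool isPreservedByMask(const uint32_t *Mask, unsigned R) {
+  // One bit per register, packed into 32-bit words.
+  return Mask[R / 32] & (1u << (R % 32));
+}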
+
+RegisterRef PhysicalRegisterInfo::normalize(RegisterRef RR) const {
+ return RR;
+}
+
+std::set<RegisterId> PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const {
+ // Do not include Reg itself in the alias set.
+ std::set<RegisterId> AS;
+ assert(isRegMaskId(Reg) || Register::isPhysicalRegister(Reg));
+ if (isRegMaskId(Reg)) {
+ // XXX SLOW
+ const uint32_t *MB = getRegMaskBits(Reg);
+ for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) {
+ if (MB[i/32] & (1u << (i%32)))
+ continue;
+ AS.insert(i);
+ }
+ for (const uint32_t *RM : RegMasks) {
+ RegisterId MI = getRegMaskId(RM);
+ if (MI != Reg && aliasMM(RegisterRef(Reg), RegisterRef(MI)))
+ AS.insert(MI);
+ }
+ return AS;
+ }
+
+ for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI)
+ AS.insert(*AI);
+ for (const uint32_t *RM : RegMasks) {
+ RegisterId MI = getRegMaskId(RM);
+ if (aliasRM(RegisterRef(Reg), RegisterRef(MI)))
+ AS.insert(MI);
+ }
+ return AS;
+}
+
+bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const {
+ assert(Register::isPhysicalRegister(RA.Reg));
+ assert(Register::isPhysicalRegister(RB.Reg));
+
+ MCRegUnitMaskIterator UMA(RA.Reg, &TRI);
+ MCRegUnitMaskIterator UMB(RB.Reg, &TRI);
+ // Reg units are returned in numerical order.
+ while (UMA.isValid() && UMB.isValid()) {
+ // Skip units that are masked off in RA.
+ std::pair<RegisterId,LaneBitmask> PA = *UMA;
+ if (PA.second.any() && (PA.second & RA.Mask).none()) {
+ ++UMA;
+ continue;
+ }
+ // Skip units that are masked off in RB.
+ std::pair<RegisterId,LaneBitmask> PB = *UMB;
+ if (PB.second.any() && (PB.second & RB.Mask).none()) {
+ ++UMB;
+ continue;
+ }
+
+ if (PA.first == PB.first)
+ return true;
+ if (PA.first < PB.first)
+ ++UMA;
+ else if (PB.first < PA.first)
+ ++UMB;
+ }
+ return false;
+}
+
+bool PhysicalRegisterInfo::aliasRM(RegisterRef RR, RegisterRef RM) const {
+ assert(Register::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg));
+ const uint32_t *MB = getRegMaskBits(RM.Reg);
+ bool Preserved = MB[RR.Reg/32] & (1u << (RR.Reg%32));
+ // If the lane mask information is "full", e.g. when the given lane mask
+ // is a superset of the lane mask from the register class, check the regmask
+ // bit directly.
+ if (RR.Mask == LaneBitmask::getAll())
+ return !Preserved;
+ const TargetRegisterClass *RC = RegInfos[RR.Reg].RegClass;
+ if (RC != nullptr && (RR.Mask & RC->LaneMask) == RC->LaneMask)
+ return !Preserved;
+
+ // Otherwise, check all subregisters whose lane mask overlaps the given
+ // mask. For each such register, if it is preserved by the regmask, then
+ // clear the corresponding bits in the given mask. If, at the end, all
+ // bits have been cleared, the register does not alias the regmask (i.e.
+ // it is preserved by it).
+ LaneBitmask M = RR.Mask;
+ for (MCSubRegIndexIterator SI(RR.Reg, &TRI); SI.isValid(); ++SI) {
+ LaneBitmask SM = TRI.getSubRegIndexLaneMask(SI.getSubRegIndex());
+ if ((SM & RR.Mask).none())
+ continue;
+ unsigned SR = SI.getSubReg();
+ if (!(MB[SR/32] & (1u << (SR%32))))
+ continue;
+ // The subregister SR is preserved.
+ M &= ~SM;
+ if (M.none())
+ return false;
+ }
+
+ return true;
+}
+
+bool PhysicalRegisterInfo::aliasMM(RegisterRef RM, RegisterRef RN) const {
+ assert(isRegMaskId(RM.Reg) && isRegMaskId(RN.Reg));
+ unsigned NumRegs = TRI.getNumRegs();
+ const uint32_t *BM = getRegMaskBits(RM.Reg);
+ const uint32_t *BN = getRegMaskBits(RN.Reg);
+
+ for (unsigned w = 0, nw = NumRegs/32; w != nw; ++w) {
+ // Intersect the negations of both words. Disregard reg=0,
+ // i.e. 0th bit in the 0th word.
+ uint32_t C = ~BM[w] & ~BN[w];
+ if (w == 0)
+ C &= ~1;
+ if (C)
+ return true;
+ }
+
+ // Check the remaining registers in the last word.
+ unsigned TailRegs = NumRegs % 32;
+ if (TailRegs == 0)
+ return false;
+ unsigned TW = NumRegs / 32;
+ uint32_t TailMask = (1u << TailRegs) - 1;
+ if (~BM[TW] & ~BN[TW] & TailMask)
+ return true;
+
+ return false;
+}
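+
+// Distilled form of the word-wise test above (illustration only, not part
+// of this change): two regmasks alias when some register other than the
+// invalid register 0 is clobbered by both, i.e. its bit is clear in both
+// masks.
+static bool bothClobberSomeReg(uint32_t WordM, uint32_t WordN, bool IsWord0) {
+  uint32_t C = ~WordM & ~WordN; // registers clobbered by both masks
+  if (IsWord0)
+    C &= ~1u;                   // disregard the invalid register 0
+  return C != 0;
+}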
+
+RegisterRef PhysicalRegisterInfo::mapTo(RegisterRef RR, unsigned R) const {
+ if (RR.Reg == R)
+ return RR;
+ if (unsigned Idx = TRI.getSubRegIndex(R, RR.Reg))
+ return RegisterRef(R, TRI.composeSubRegIndexLaneMask(Idx, RR.Mask));
+ if (unsigned Idx = TRI.getSubRegIndex(RR.Reg, R)) {
+ const RegInfo &RI = RegInfos[R];
+ LaneBitmask RCM = RI.RegClass ? RI.RegClass->LaneMask
+ : LaneBitmask::getAll();
+ LaneBitmask M = TRI.reverseComposeSubRegIndexLaneMask(Idx, RR.Mask);
+ return RegisterRef(R, M & RCM);
+ }
+ llvm_unreachable("Invalid arguments: unrelated registers?");
+}
+
+bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
+ if (PhysicalRegisterInfo::isRegMaskId(RR.Reg))
+ return Units.anyCommon(PRI.getMaskUnits(RR.Reg));
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t,LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ if (Units.test(P.first))
+ return true;
+ }
+ return false;
+}
+
+bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
+ if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) {
+ BitVector T(PRI.getMaskUnits(RR.Reg));
+ return T.reset(Units).none();
+ }
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t,LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ if (!Units.test(P.first))
+ return false;
+ }
+ return true;
+}
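+
+// Sketch for illustration (hypothetical helper, not part of this change):
+// on plain unit sets, hasAliasOf above is a non-empty intersection, while
+// hasCoverOf is set inclusion, mirroring the BitVector test used for the
+// regmask case.
+static bool unitsCover(const BitVector &Have, const BitVector &Want) {
+  BitVector T(Want);
+  T.reset(Have);   // drop every unit we already have
+  return T.none(); // covered iff nothing remains
+}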
+
+RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
+ if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) {
+ Units |= PRI.getMaskUnits(RR.Reg);
+ return *this;
+ }
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t,LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ Units.set(P.first);
+ }
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::insert(const RegisterAggr &RG) {
+ Units |= RG.Units;
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::intersect(RegisterRef RR) {
+ return intersect(RegisterAggr(PRI).insert(RR));
+}
+
+RegisterAggr &RegisterAggr::intersect(const RegisterAggr &RG) {
+ Units &= RG.Units;
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::clear(RegisterRef RR) {
+ return clear(RegisterAggr(PRI).insert(RR));
+}
+
+RegisterAggr &RegisterAggr::clear(const RegisterAggr &RG) {
+ Units.reset(RG.Units);
+ return *this;
+}
+
+RegisterRef RegisterAggr::intersectWith(RegisterRef RR) const {
+ RegisterAggr T(PRI);
+ T.insert(RR).intersect(*this);
+ if (T.empty())
+ return RegisterRef();
+ RegisterRef NR = T.makeRegRef();
+ assert(NR);
+ return NR;
+}
+
+RegisterRef RegisterAggr::clearIn(RegisterRef RR) const {
+ return RegisterAggr(PRI).insert(RR).clear(*this).makeRegRef();
+}
+
+RegisterRef RegisterAggr::makeRegRef() const {
+ int U = Units.find_first();
+ if (U < 0)
+ return RegisterRef();
+
+ auto AliasedRegs = [this] (uint32_t Unit, BitVector &Regs) {
+ for (MCRegUnitRootIterator R(Unit, &PRI.getTRI()); R.isValid(); ++R)
+ for (MCSuperRegIterator S(*R, &PRI.getTRI(), true); S.isValid(); ++S)
+ Regs.set(*S);
+ };
+
+ // Find the set of all registers that are aliased to all the units
+ // in this aggregate.
+
+ // Get all the registers aliased to the first unit in the bit vector.
+ BitVector Regs(PRI.getTRI().getNumRegs());
+ AliasedRegs(U, Regs);
+ U = Units.find_next(U);
+
+ // For each other unit, intersect it with the set of all registers
+ // aliased to that unit.
+ while (U >= 0) {
+ BitVector AR(PRI.getTRI().getNumRegs());
+ AliasedRegs(U, AR);
+ Regs &= AR;
+ U = Units.find_next(U);
+ }
+
+ // If there is at least one register remaining, pick the first one,
+ // and consolidate the masks of all of its units contained in this
+ // aggregate.
+
+ int F = Regs.find_first();
+ if (F <= 0)
+ return RegisterRef();
+
+ LaneBitmask M;
+ for (MCRegUnitMaskIterator I(F, &PRI.getTRI()); I.isValid(); ++I) {
+ std::pair<uint32_t,LaneBitmask> P = *I;
+ if (Units.test(P.first))
+ M |= P.second.none() ? LaneBitmask::getAll() : P.second;
+ }
+ return RegisterRef(F, M);
+}
+
+void RegisterAggr::print(raw_ostream &OS) const {
+ OS << '{';
+ for (int U = Units.find_first(); U >= 0; U = Units.find_next(U))
+ OS << ' ' << printRegUnit(U, &PRI.getTRI());
+ OS << " }";
+}
+
+RegisterAggr::rr_iterator::rr_iterator(const RegisterAggr &RG,
+ bool End)
+ : Owner(&RG) {
+ for (int U = RG.Units.find_first(); U >= 0; U = RG.Units.find_next(U)) {
+ RegisterRef R = RG.PRI.getRefForUnit(U);
+ Masks[R.Reg] |= R.Mask;
+ }
+ Pos = End ? Masks.end() : Masks.begin();
+ Index = End ? Masks.size() : 0;
+}
Property changes on: head/contrib/llvm-project/llvm/lib/CodeGen/RDFRegisters.cpp
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/LTO/LTO.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/LTO/LTO.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/LTO/LTO.cpp (revision 362609)
@@ -1,1418 +1,1438 @@
//===-LTO.cpp - LLVM Link Time Optimizer ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions and classes used to support LTO.
//
//===----------------------------------------------------------------------===//
#include "llvm/LTO/LTO.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/RemarkStreamer.h"
#include "llvm/LTO/LTOBackend.h"
#include "llvm/LTO/SummaryBasedOptimizations.h"
#include "llvm/Linker/IRMover.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SHA1.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/VCSRevision.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
#include "llvm/Transforms/Utils/FunctionImportUtils.h"
#include "llvm/Transforms/Utils/SplitModule.h"
#include <set>
using namespace llvm;
using namespace lto;
using namespace object;
#define DEBUG_TYPE "lto"
static cl::opt<bool>
DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
/// Enable global value internalization in LTO.
cl::opt<bool> EnableLTOInternalization(
"enable-lto-internalization", cl::init(true), cl::Hidden,
cl::desc("Enable global value internalization in LTO"));
// Computes a unique hash for the Module considering the current export and
// import lists and other global analysis results.
// The hash is produced in \p Key.
void llvm::computeLTOCacheKey(
SmallString<40> &Key, const Config &Conf, const ModuleSummaryIndex &Index,
StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
const std::set<GlobalValue::GUID> &CfiFunctionDefs,
const std::set<GlobalValue::GUID> &CfiFunctionDecls) {
// Compute the unique hash for this entry.
// This is based on the current compiler version, the module itself, the
// export list, the hash for every single module in the import list, the
// list of ResolvedODR for the module, and the list of preserved symbols.
SHA1 Hasher;
// Start with the compiler revision
Hasher.update(LLVM_VERSION_STRING);
#ifdef LLVM_REVISION
Hasher.update(LLVM_REVISION);
#endif
// Include the parts of the LTO configuration that affect code generation.
auto AddString = [&](StringRef Str) {
Hasher.update(Str);
Hasher.update(ArrayRef<uint8_t>{0});
};
auto AddUnsigned = [&](unsigned I) {
uint8_t Data[4];
Data[0] = I;
Data[1] = I >> 8;
Data[2] = I >> 16;
Data[3] = I >> 24;
Hasher.update(ArrayRef<uint8_t>{Data, 4});
};
auto AddUint64 = [&](uint64_t I) {
uint8_t Data[8];
Data[0] = I;
Data[1] = I >> 8;
Data[2] = I >> 16;
Data[3] = I >> 24;
Data[4] = I >> 32;
Data[5] = I >> 40;
Data[6] = I >> 48;
Data[7] = I >> 56;
Hasher.update(ArrayRef<uint8_t>{Data, 8});
};
AddString(Conf.CPU);
// FIXME: Hash more of Options. For now all clients initialize Options from
// command-line flags (which is unsupported in production), but may set
// RelaxELFRelocations. The clang driver can also pass FunctionSections,
// DataSections and DebuggerTuning via command line flags.
AddUnsigned(Conf.Options.RelaxELFRelocations);
AddUnsigned(Conf.Options.FunctionSections);
AddUnsigned(Conf.Options.DataSections);
AddUnsigned((unsigned)Conf.Options.DebuggerTuning);
for (auto &A : Conf.MAttrs)
AddString(A);
if (Conf.RelocModel)
AddUnsigned(*Conf.RelocModel);
else
AddUnsigned(-1);
if (Conf.CodeModel)
AddUnsigned(*Conf.CodeModel);
else
AddUnsigned(-1);
AddUnsigned(Conf.CGOptLevel);
AddUnsigned(Conf.CGFileType);
AddUnsigned(Conf.OptLevel);
AddUnsigned(Conf.UseNewPM);
AddUnsigned(Conf.Freestanding);
AddString(Conf.OptPipeline);
AddString(Conf.AAPipeline);
AddString(Conf.OverrideTriple);
AddString(Conf.DefaultTriple);
AddString(Conf.DwoDir);
// Include the hash for the current module
auto ModHash = Index.getModuleHash(ModuleID);
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
+
+ std::vector<uint64_t> ExportsGUID;
+ ExportsGUID.reserve(ExportList.size());
for (const auto &VI : ExportList) {
auto GUID = VI.getGUID();
+ ExportsGUID.push_back(GUID);
+ }
+
+ // Sort the export list elements GUIDs.
+ llvm::sort(ExportsGUID);
+ for (uint64_t GUID : ExportsGUID) {
// The export list can impact internalization, so be conservative here.
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&GUID, sizeof(GUID)));
}
// Include the hash for every module we import functions from. The set of
// imported symbols for each module may affect code generation and is
// sensitive to link order, so include that as well.
- for (auto &Entry : ImportList) {
- auto ModHash = Index.getModuleHash(Entry.first());
+ using ImportMapIteratorTy = FunctionImporter::ImportMapTy::const_iterator;
+ std::vector<ImportMapIteratorTy> ImportModulesVector;
+ ImportModulesVector.reserve(ImportList.size());
+
+ for (ImportMapIteratorTy It = ImportList.begin(); It != ImportList.end();
+ ++It) {
+ ImportModulesVector.push_back(It);
+ }
+ llvm::sort(ImportModulesVector,
+ [](const ImportMapIteratorTy &Lhs, const ImportMapIteratorTy &Rhs)
+ -> bool { return Lhs->getKey() < Rhs->getKey(); });
+ for (const ImportMapIteratorTy &EntryIt : ImportModulesVector) {
+ auto ModHash = Index.getModuleHash(EntryIt->first());
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
- AddUint64(Entry.second.size());
- for (auto &Fn : Entry.second)
+ AddUint64(EntryIt->second.size());
+ for (auto &Fn : EntryIt->second)
AddUint64(Fn);
}
// Include the hash for the resolved ODR.
for (auto &Entry : ResolvedODR) {
Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&Entry.first,
sizeof(GlobalValue::GUID)));
Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&Entry.second,
sizeof(GlobalValue::LinkageTypes)));
}
// Members of CfiFunctionDefs and CfiFunctionDecls that are referenced or
// defined in this module.
std::set<GlobalValue::GUID> UsedCfiDefs;
std::set<GlobalValue::GUID> UsedCfiDecls;
// Typeids used in this module.
std::set<GlobalValue::GUID> UsedTypeIds;
auto AddUsedCfiGlobal = [&](GlobalValue::GUID ValueGUID) {
if (CfiFunctionDefs.count(ValueGUID))
UsedCfiDefs.insert(ValueGUID);
if (CfiFunctionDecls.count(ValueGUID))
UsedCfiDecls.insert(ValueGUID);
};
auto AddUsedThings = [&](GlobalValueSummary *GS) {
if (!GS) return;
AddUnsigned(GS->isLive());
AddUnsigned(GS->canAutoHide());
for (const ValueInfo &VI : GS->refs()) {
AddUnsigned(VI.isDSOLocal());
AddUsedCfiGlobal(VI.getGUID());
}
if (auto *GVS = dyn_cast<GlobalVarSummary>(GS)) {
AddUnsigned(GVS->maybeReadOnly());
AddUnsigned(GVS->maybeWriteOnly());
}
if (auto *FS = dyn_cast<FunctionSummary>(GS)) {
for (auto &TT : FS->type_tests())
UsedTypeIds.insert(TT);
for (auto &TT : FS->type_test_assume_vcalls())
UsedTypeIds.insert(TT.GUID);
for (auto &TT : FS->type_checked_load_vcalls())
UsedTypeIds.insert(TT.GUID);
for (auto &TT : FS->type_test_assume_const_vcalls())
UsedTypeIds.insert(TT.VFunc.GUID);
for (auto &TT : FS->type_checked_load_const_vcalls())
UsedTypeIds.insert(TT.VFunc.GUID);
for (auto &ET : FS->calls()) {
AddUnsigned(ET.first.isDSOLocal());
AddUsedCfiGlobal(ET.first.getGUID());
}
}
};
// Include the hash for the linkage type to reflect internalization and weak
// resolution, and collect any used type identifier resolutions.
for (auto &GS : DefinedGlobals) {
GlobalValue::LinkageTypes Linkage = GS.second->linkage();
Hasher.update(
ArrayRef<uint8_t>((const uint8_t *)&Linkage, sizeof(Linkage)));
AddUsedCfiGlobal(GS.first);
AddUsedThings(GS.second);
}
// Imported functions may introduce new uses of type identifier resolutions,
// so we need to collect their used resolutions as well.
for (auto &ImpM : ImportList)
for (auto &ImpF : ImpM.second) {
GlobalValueSummary *S = Index.findSummaryInModule(ImpF, ImpM.first());
AddUsedThings(S);
// If this is an alias, we also care about any types/etc. that the aliasee
// may reference.
if (auto *AS = dyn_cast_or_null<AliasSummary>(S))
AddUsedThings(AS->getBaseObject());
}
auto AddTypeIdSummary = [&](StringRef TId, const TypeIdSummary &S) {
AddString(TId);
AddUnsigned(S.TTRes.TheKind);
AddUnsigned(S.TTRes.SizeM1BitWidth);
AddUint64(S.TTRes.AlignLog2);
AddUint64(S.TTRes.SizeM1);
AddUint64(S.TTRes.BitMask);
AddUint64(S.TTRes.InlineBits);
AddUint64(S.WPDRes.size());
for (auto &WPD : S.WPDRes) {
AddUnsigned(WPD.first);
AddUnsigned(WPD.second.TheKind);
AddString(WPD.second.SingleImplName);
AddUint64(WPD.second.ResByArg.size());
for (auto &ByArg : WPD.second.ResByArg) {
AddUint64(ByArg.first.size());
for (uint64_t Arg : ByArg.first)
AddUint64(Arg);
AddUnsigned(ByArg.second.TheKind);
AddUint64(ByArg.second.Info);
AddUnsigned(ByArg.second.Byte);
AddUnsigned(ByArg.second.Bit);
}
}
};
// Include the hash for all type identifiers used by this module.
for (GlobalValue::GUID TId : UsedTypeIds) {
auto TidIter = Index.typeIds().equal_range(TId);
for (auto It = TidIter.first; It != TidIter.second; ++It)
AddTypeIdSummary(It->second.first, It->second.second);
}
AddUnsigned(UsedCfiDefs.size());
for (auto &V : UsedCfiDefs)
AddUint64(V);
AddUnsigned(UsedCfiDecls.size());
for (auto &V : UsedCfiDecls)
AddUint64(V);
if (!Conf.SampleProfile.empty()) {
auto FileOrErr = MemoryBuffer::getFile(Conf.SampleProfile);
if (FileOrErr) {
Hasher.update(FileOrErr.get()->getBuffer());
if (!Conf.ProfileRemapping.empty()) {
FileOrErr = MemoryBuffer::getFile(Conf.ProfileRemapping);
if (FileOrErr)
Hasher.update(FileOrErr.get()->getBuffer());
}
}
}
Key = toHex(Hasher.result());
}
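// Minimal usage sketch (illustration only, not part of this change) of the
// SHA1-to-hex pattern that computeLTOCacheKey relies on above. The helper
// name is hypothetical.
static std::string hashedKeyForIllustration(StringRef S) {
  SHA1 Hasher;
  Hasher.update(S);
  Hasher.update(ArrayRef<uint8_t>{0}); // null separator, as in AddString
  return toHex(Hasher.result());
}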
static void thinLTOResolvePrevailingGUID(
ValueInfo VI, DenseSet<GlobalValueSummary *> &GlobalInvolvedWithAlias,
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing,
function_ref<void(StringRef, GlobalValue::GUID, GlobalValue::LinkageTypes)>
recordNewLinkage,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
for (auto &S : VI.getSummaryList()) {
GlobalValue::LinkageTypes OriginalLinkage = S->linkage();
// Ignore local and appending linkage values since the linker
// doesn't resolve them.
if (GlobalValue::isLocalLinkage(OriginalLinkage) ||
GlobalValue::isAppendingLinkage(S->linkage()))
continue;
// We need to emit only one of these. The prevailing module will keep it,
// but turned into a weak, while the others will drop it when possible.
// This is both a compile-time optimization and a correctness
// transformation. This is necessary for correctness when we have exported
// a reference - we need to convert the linkonce to weak to
// ensure a copy is kept to satisfy the exported reference.
// FIXME: We may want to split the compile time and correctness
// aspects into separate routines.
if (isPrevailing(VI.getGUID(), S.get())) {
if (GlobalValue::isLinkOnceLinkage(OriginalLinkage)) {
S->setLinkage(GlobalValue::getWeakLinkage(
GlobalValue::isLinkOnceODRLinkage(OriginalLinkage)));
// The kept copy is eligible for auto-hiding (hidden visibility) if all
// copies were (i.e. they were all linkonce_odr global unnamed addr).
// If any copy is not (e.g. it was originally weak_odr), then the symbol
// must remain externally available (e.g. a weak_odr from an explicitly
// instantiated template). Additionally, if it is in the
// GUIDPreservedSymbols set, that means that it is visible outside
// the summary (e.g. in a native object or a bitcode file without
// summary), and in that case we cannot hide it as it isn't possible to
// check all copies.
S->setCanAutoHide(VI.canAutoHide() &&
!GUIDPreservedSymbols.count(VI.getGUID()));
}
}
// Alias and aliasee can't be turned into available_externally.
else if (!isa<AliasSummary>(S.get()) &&
!GlobalInvolvedWithAlias.count(S.get()))
S->setLinkage(GlobalValue::AvailableExternallyLinkage);
if (S->linkage() != OriginalLinkage)
recordNewLinkage(S->modulePath(), VI.getGUID(), S->linkage());
}
}
/// Resolve linkage for prevailing symbols in the \p Index.
//
// We'd like to drop these functions if they are no longer referenced in the
// current module. However there is a chance that another module is still
// referencing them because of the import. We make sure we always emit at least
// one copy.
void llvm::thinLTOResolvePrevailingInIndex(
ModuleSummaryIndex &Index,
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing,
function_ref<void(StringRef, GlobalValue::GUID, GlobalValue::LinkageTypes)>
recordNewLinkage,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
// We won't optimize the globals that are referenced by an alias for now.
// Ideally we should turn the alias into a global and duplicate the definition
// when needed.
DenseSet<GlobalValueSummary *> GlobalInvolvedWithAlias;
for (auto &I : Index)
for (auto &S : I.second.SummaryList)
if (auto AS = dyn_cast<AliasSummary>(S.get()))
GlobalInvolvedWithAlias.insert(&AS->getAliasee());
for (auto &I : Index)
thinLTOResolvePrevailingGUID(Index.getValueInfo(I), GlobalInvolvedWithAlias,
isPrevailing, recordNewLinkage,
GUIDPreservedSymbols);
}
static bool isWeakObjectWithRWAccess(GlobalValueSummary *GVS) {
if (auto *VarSummary = dyn_cast<GlobalVarSummary>(GVS->getBaseObject()))
return !VarSummary->maybeReadOnly() && !VarSummary->maybeWriteOnly() &&
(VarSummary->linkage() == GlobalValue::WeakODRLinkage ||
VarSummary->linkage() == GlobalValue::LinkOnceODRLinkage);
return false;
}
static void thinLTOInternalizeAndPromoteGUID(
ValueInfo VI, function_ref<bool(StringRef, ValueInfo)> isExported,
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing) {
for (auto &S : VI.getSummaryList()) {
if (isExported(S->modulePath(), VI)) {
if (GlobalValue::isLocalLinkage(S->linkage()))
S->setLinkage(GlobalValue::ExternalLinkage);
} else if (EnableLTOInternalization &&
// Ignore local and appending linkage values since the linker
// doesn't resolve them.
!GlobalValue::isLocalLinkage(S->linkage()) &&
(!GlobalValue::isInterposableLinkage(S->linkage()) ||
isPrevailing(VI.getGUID(), S.get())) &&
S->linkage() != GlobalValue::AppendingLinkage &&
// We can't internalize available_externally globals because this
// can break function pointer equality.
S->linkage() != GlobalValue::AvailableExternallyLinkage &&
// Functions and read-only variables with linkonce_odr and
// weak_odr linkage can be internalized. We can't internalize
// linkonce_odr and weak_odr variables which are both modified
// and read somewhere in the program because reads and writes
// will become inconsistent.
!isWeakObjectWithRWAccess(S.get()))
S->setLinkage(GlobalValue::InternalLinkage);
}
}
// Update the linkages in the given \p Index to mark exported values
// as external and non-exported values as internal.
void llvm::thinLTOInternalizeAndPromoteInIndex(
ModuleSummaryIndex &Index,
function_ref<bool(StringRef, ValueInfo)> isExported,
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing) {
for (auto &I : Index)
thinLTOInternalizeAndPromoteGUID(Index.getValueInfo(I), isExported,
isPrevailing);
}
// Requires a destructor for std::vector<InputModule>.
InputFile::~InputFile() = default;
Expected<std::unique_ptr<InputFile>> InputFile::create(MemoryBufferRef Object) {
std::unique_ptr<InputFile> File(new InputFile);
Expected<IRSymtabFile> FOrErr = readIRSymtab(Object);
if (!FOrErr)
return FOrErr.takeError();
File->TargetTriple = FOrErr->TheReader.getTargetTriple();
File->SourceFileName = FOrErr->TheReader.getSourceFileName();
File->COFFLinkerOpts = FOrErr->TheReader.getCOFFLinkerOpts();
File->DependentLibraries = FOrErr->TheReader.getDependentLibraries();
File->ComdatTable = FOrErr->TheReader.getComdatTable();
for (unsigned I = 0; I != FOrErr->Mods.size(); ++I) {
size_t Begin = File->Symbols.size();
for (const irsymtab::Reader::SymbolRef &Sym :
FOrErr->TheReader.module_symbols(I))
// Skip symbols that are irrelevant to LTO. Note that this condition needs
// to match the one in Skip() in LTO::addRegularLTO().
if (Sym.isGlobal() && !Sym.isFormatSpecific())
File->Symbols.push_back(Sym);
File->ModuleSymIndices.push_back({Begin, File->Symbols.size()});
}
File->Mods = FOrErr->Mods;
File->Strtab = std::move(FOrErr->Strtab);
return std::move(File);
}
StringRef InputFile::getName() const {
return Mods[0].getModuleIdentifier();
}
BitcodeModule &InputFile::getSingleBitcodeModule() {
assert(Mods.size() == 1 && "Expect only one bitcode module");
return Mods[0];
}
LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
const Config &Conf)
: ParallelCodeGenParallelismLevel(ParallelCodeGenParallelismLevel),
Ctx(Conf), CombinedModule(std::make_unique<Module>("ld-temp.o", Ctx)),
Mover(std::make_unique<IRMover>(*CombinedModule)) {}
LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
: Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
if (!Backend)
this->Backend =
createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
}
LTO::LTO(Config Conf, ThinBackend Backend,
unsigned ParallelCodeGenParallelismLevel)
: Conf(std::move(Conf)),
RegularLTO(ParallelCodeGenParallelismLevel, this->Conf),
ThinLTO(std::move(Backend)) {}
// Requires a destructor for MapVector<BitcodeModule>.
LTO::~LTO() = default;
// Add the symbols in the given module to the GlobalResolutions map, and resolve
// their partitions.
void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms,
ArrayRef<SymbolResolution> Res,
unsigned Partition, bool InSummary) {
auto *ResI = Res.begin();
auto *ResE = Res.end();
(void)ResE;
for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
StringRef Name = Sym.getName();
Triple TT(RegularLTO.CombinedModule->getTargetTriple());
// Strip the __imp_ prefix from COFF dllimport symbols (similar to the
// way they are handled by lld), otherwise we can end up with two
// global resolutions (one for the symbol with the prefix and one for the
// copy without it).
if (TT.isOSBinFormatCOFF() && Name.startswith("__imp_"))
Name = Name.substr(strlen("__imp_"));
auto &GlobalRes = GlobalResolutions[Name];
GlobalRes.UnnamedAddr &= Sym.isUnnamedAddr();
if (Res.Prevailing) {
assert(!GlobalRes.Prevailing &&
"Multiple prevailing defs are not allowed");
GlobalRes.Prevailing = true;
GlobalRes.IRName = Sym.getIRName();
} else if (!GlobalRes.Prevailing && GlobalRes.IRName.empty()) {
// Sometimes there can be two copies of a symbol in a module, and the
// prevailing symbol can have no IR name. That can happen if the symbol is
// defined in a module-level inline asm block. When we have multiple modules
// with the same symbol, we want to use the IR name of the prevailing symbol.
// Otherwise, if we haven't seen a prevailing symbol, set the name so that
// we can later use it to check if there is any prevailing copy in IR.
GlobalRes.IRName = Sym.getIRName();
}
// Set the partition to external if we know it is re-defined by the linker
// with -defsym or -wrap options, used elsewhere, e.g. it is visible to a
// regular object, is referenced from llvm.compiler_used, or was already
// recorded as being referenced from a different partition.
if (Res.LinkerRedefined || Res.VisibleToRegularObj || Sym.isUsed() ||
(GlobalRes.Partition != GlobalResolution::Unknown &&
GlobalRes.Partition != Partition)) {
GlobalRes.Partition = GlobalResolution::External;
} else
// First recorded reference, save the current partition.
GlobalRes.Partition = Partition;
// Flag as visible outside of summary if visible from a regular object or
// from a module that does not have a summary.
GlobalRes.VisibleOutsideSummary |=
(Res.VisibleToRegularObj || Sym.isUsed() || !InSummary);
}
}
static void writeToResolutionFile(raw_ostream &OS, InputFile *Input,
ArrayRef<SymbolResolution> Res) {
StringRef Path = Input->getName();
OS << Path << '\n';
auto ResI = Res.begin();
for (const InputFile::Symbol &Sym : Input->symbols()) {
assert(ResI != Res.end());
SymbolResolution Res = *ResI++;
OS << "-r=" << Path << ',' << Sym.getName() << ',';
if (Res.Prevailing)
OS << 'p';
if (Res.FinalDefinitionInLinkageUnit)
OS << 'l';
if (Res.VisibleToRegularObj)
OS << 'x';
if (Res.LinkerRedefined)
OS << 'r';
OS << '\n';
}
OS.flush();
assert(ResI == Res.end());
}
Error LTO::add(std::unique_ptr<InputFile> Input,
ArrayRef<SymbolResolution> Res) {
assert(!CalledGetMaxTasks);
if (Conf.ResolutionFile)
writeToResolutionFile(*Conf.ResolutionFile, Input.get(), Res);
if (RegularLTO.CombinedModule->getTargetTriple().empty())
RegularLTO.CombinedModule->setTargetTriple(Input->getTargetTriple());
const SymbolResolution *ResI = Res.begin();
for (unsigned I = 0; I != Input->Mods.size(); ++I)
if (Error Err = addModule(*Input, I, ResI, Res.end()))
return Err;
assert(ResI == Res.end());
return Error::success();
}
Error LTO::addModule(InputFile &Input, unsigned ModI,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
Expected<BitcodeLTOInfo> LTOInfo = Input.Mods[ModI].getLTOInfo();
if (!LTOInfo)
return LTOInfo.takeError();
if (EnableSplitLTOUnit.hasValue()) {
// If only some modules were split, flag this in the index so that
// we can skip or error on optimizations that need consistently split
// modules (whole program devirt and lower type tests).
if (EnableSplitLTOUnit.getValue() != LTOInfo->EnableSplitLTOUnit)
ThinLTO.CombinedIndex.setPartiallySplitLTOUnits();
} else
EnableSplitLTOUnit = LTOInfo->EnableSplitLTOUnit;
BitcodeModule BM = Input.Mods[ModI];
auto ModSyms = Input.module_symbols(ModI);
addModuleToGlobalRes(ModSyms, {ResI, ResE},
LTOInfo->IsThinLTO ? ThinLTO.ModuleMap.size() + 1 : 0,
LTOInfo->HasSummary);
if (LTOInfo->IsThinLTO)
return addThinLTO(BM, ModSyms, ResI, ResE);
Expected<RegularLTOState::AddedModule> ModOrErr =
addRegularLTO(BM, ModSyms, ResI, ResE);
if (!ModOrErr)
return ModOrErr.takeError();
if (!LTOInfo->HasSummary)
return linkRegularLTO(std::move(*ModOrErr), /*LivenessFromIndex=*/false);
// Regular LTO module summaries are added to a dummy module that represents
// the combined regular LTO module.
if (Error Err = BM.readSummary(ThinLTO.CombinedIndex, "", -1ull))
return Err;
RegularLTO.ModsWithSummaries.push_back(std::move(*ModOrErr));
return Error::success();
}
// Checks whether the given global value is in a non-prevailing comdat
// (comdat containing values the linker indicated were not prevailing,
// which we then dropped to available_externally), and if so, removes
// it from the comdat. This is called for all global values to ensure the
// comdat is empty rather than leaving an incomplete comdat. It is needed for
// regular LTO modules, in case we are in a mixed-LTO mode (both regular
// and thin LTO modules) compilation. Since the regular LTO module will be
// linked first in the final native link, we want to make sure the linker
// doesn't select any of these incomplete comdats that would be left
// in the regular LTO module without this cleanup.
static void
handleNonPrevailingComdat(GlobalValue &GV,
std::set<const Comdat *> &NonPrevailingComdats) {
Comdat *C = GV.getComdat();
if (!C)
return;
if (!NonPrevailingComdats.count(C))
return;
// Additionally, we need to drop externally visible global values from the comdat
// to available_externally, so that there aren't multiply defined linker
// errors.
if (!GV.hasLocalLinkage())
GV.setLinkage(GlobalValue::AvailableExternallyLinkage);
if (auto GO = dyn_cast<GlobalObject>(&GV))
GO->setComdat(nullptr);
}
// Add a regular LTO object to the link.
// The resulting module needs to be linked into the combined LTO module with
// linkRegularLTO.
Expected<LTO::RegularLTOState::AddedModule>
LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
RegularLTOState::AddedModule Mod;
Expected<std::unique_ptr<Module>> MOrErr =
BM.getLazyModule(RegularLTO.Ctx, /*ShouldLazyLoadMetadata*/ true,
/*IsImporting*/ false);
if (!MOrErr)
return MOrErr.takeError();
Module &M = **MOrErr;
Mod.M = std::move(*MOrErr);
if (Error Err = M.materializeMetadata())
return std::move(Err);
UpgradeDebugInfo(M);
ModuleSymbolTable SymTab;
SymTab.addModule(&M);
for (GlobalVariable &GV : M.globals())
if (GV.hasAppendingLinkage())
Mod.Keep.push_back(&GV);
DenseSet<GlobalObject *> AliasedGlobals;
for (auto &GA : M.aliases())
if (GlobalObject *GO = GA.getBaseObject())
AliasedGlobals.insert(GO);
// In this function we need IR GlobalValues matching the symbols in Syms
// (which is not backed by a module), so we need to enumerate them in the same
// order. The symbol enumeration order of a ModuleSymbolTable intentionally
// matches the order of an irsymtab, but when we read the irsymtab in
// InputFile::create we omit some symbols that are irrelevant to LTO. The
// Skip() function skips the same symbols from the module as InputFile does
// from the symbol table.
auto MsymI = SymTab.symbols().begin(), MsymE = SymTab.symbols().end();
auto Skip = [&]() {
while (MsymI != MsymE) {
auto Flags = SymTab.getSymbolFlags(*MsymI);
if ((Flags & object::BasicSymbolRef::SF_Global) &&
!(Flags & object::BasicSymbolRef::SF_FormatSpecific))
return;
++MsymI;
}
};
Skip();
std::set<const Comdat *> NonPrevailingComdats;
for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
assert(MsymI != MsymE);
ModuleSymbolTable::Symbol Msym = *MsymI++;
Skip();
if (GlobalValue *GV = Msym.dyn_cast<GlobalValue *>()) {
if (Res.Prevailing) {
if (Sym.isUndefined())
continue;
Mod.Keep.push_back(GV);
// For symbols re-defined with linker -wrap and -defsym options,
// set the linkage to weak to inhibit IPO. The linkage will be
// restored by the linker.
if (Res.LinkerRedefined)
GV->setLinkage(GlobalValue::WeakAnyLinkage);
GlobalValue::LinkageTypes OriginalLinkage = GV->getLinkage();
if (GlobalValue::isLinkOnceLinkage(OriginalLinkage))
GV->setLinkage(GlobalValue::getWeakLinkage(
GlobalValue::isLinkOnceODRLinkage(OriginalLinkage)));
} else if (isa<GlobalObject>(GV) &&
(GV->hasLinkOnceODRLinkage() || GV->hasWeakODRLinkage() ||
GV->hasAvailableExternallyLinkage()) &&
!AliasedGlobals.count(cast<GlobalObject>(GV))) {
// Any of the above three types of linkage indicates that the
// chosen prevailing symbol will have the same semantics as this copy of
// the symbol, so we may be able to link it with available_externally
// linkage. We will decide later whether to do that when we link this
// module (in linkRegularLTO), based on whether it is undefined.
Mod.Keep.push_back(GV);
GV->setLinkage(GlobalValue::AvailableExternallyLinkage);
if (GV->hasComdat())
NonPrevailingComdats.insert(GV->getComdat());
cast<GlobalObject>(GV)->setComdat(nullptr);
}
// Set the 'local' flag based on the linker resolution for this symbol.
if (Res.FinalDefinitionInLinkageUnit) {
GV->setDSOLocal(true);
if (GV->hasDLLImportStorageClass())
GV->setDLLStorageClass(GlobalValue::DLLStorageClassTypes::
DefaultStorageClass);
}
}
// Common resolution: collect the maximum size/alignment over all commons.
// We also record if we see an instance of a common as prevailing, so that
// if none is prevailing we can ignore it later.
if (Sym.isCommon()) {
// FIXME: We should figure out what to do about commons defined by asm.
// For now they aren't reported correctly by ModuleSymbolTable.
auto &CommonRes = RegularLTO.Commons[Sym.getIRName()];
CommonRes.Size = std::max(CommonRes.Size, Sym.getCommonSize());
CommonRes.Align =
std::max(CommonRes.Align, MaybeAlign(Sym.getCommonAlignment()));
CommonRes.Prevailing |= Res.Prevailing;
}
}
if (!M.getComdatSymbolTable().empty())
for (GlobalValue &GV : M.global_values())
handleNonPrevailingComdat(GV, NonPrevailingComdats);
assert(MsymI == MsymE);
return std::move(Mod);
}
Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
bool LivenessFromIndex) {
std::vector<GlobalValue *> Keep;
for (GlobalValue *GV : Mod.Keep) {
if (LivenessFromIndex && !ThinLTO.CombinedIndex.isGUIDLive(GV->getGUID()))
continue;
if (!GV->hasAvailableExternallyLinkage()) {
Keep.push_back(GV);
continue;
}
// Only link available_externally definitions if we don't already have a
// definition.
GlobalValue *CombinedGV =
RegularLTO.CombinedModule->getNamedValue(GV->getName());
if (CombinedGV && !CombinedGV->isDeclaration())
continue;
Keep.push_back(GV);
}
return RegularLTO.Mover->move(std::move(Mod.M), Keep,
[](GlobalValue &, IRMover::ValueAdder) {},
/* IsPerformingImport */ false);
}
// Add a ThinLTO module to the link.
Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
if (Error Err =
BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(),
ThinLTO.ModuleMap.size()))
return Err;
for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
if (!Sym.getIRName().empty()) {
auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
if (Res.Prevailing) {
ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
// For linker redefined symbols (via --wrap or --defsym) we want to
// switch the linkage to `weak` to prevent IPOs from happening.
// Find the summary in the module for this very GV and record the new
// linkage so that we can switch it when we import the GV.
if (Res.LinkerRedefined)
if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
GUID, BM.getModuleIdentifier()))
S->setLinkage(GlobalValue::WeakAnyLinkage);
}
// If the linker resolved the symbol to a local definition then mark it
// as local in the summary for the module we are adding.
if (Res.FinalDefinitionInLinkageUnit) {
if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
GUID, BM.getModuleIdentifier())) {
S->setDSOLocal(true);
}
}
}
}
if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second)
return make_error<StringError>(
"Expected at most one ThinLTO module per bitcode file",
inconvertibleErrorCode());
return Error::success();
}
unsigned LTO::getMaxTasks() const {
CalledGetMaxTasks = true;
return RegularLTO.ParallelCodeGenParallelismLevel + ThinLTO.ModuleMap.size();
}
// If only some of the modules were split, we cannot correctly handle
// code that contains type tests or type checked loads.
Error LTO::checkPartiallySplit() {
if (!ThinLTO.CombinedIndex.partiallySplitLTOUnits())
return Error::success();
Function *TypeTestFunc = RegularLTO.CombinedModule->getFunction(
Intrinsic::getName(Intrinsic::type_test));
Function *TypeCheckedLoadFunc = RegularLTO.CombinedModule->getFunction(
Intrinsic::getName(Intrinsic::type_checked_load));
// First check if there are type tests / type checked loads in the
// merged regular LTO module IR.
if ((TypeTestFunc && !TypeTestFunc->use_empty()) ||
(TypeCheckedLoadFunc && !TypeCheckedLoadFunc->use_empty()))
return make_error<StringError>(
"inconsistent LTO Unit splitting (recompile with -fsplit-lto-unit)",
inconvertibleErrorCode());
// Otherwise check if there are any recorded in the combined summary from the
// ThinLTO modules.
for (auto &P : ThinLTO.CombinedIndex) {
for (auto &S : P.second.SummaryList) {
auto *FS = dyn_cast<FunctionSummary>(S.get());
if (!FS)
continue;
if (!FS->type_test_assume_vcalls().empty() ||
!FS->type_checked_load_vcalls().empty() ||
!FS->type_test_assume_const_vcalls().empty() ||
!FS->type_checked_load_const_vcalls().empty() ||
!FS->type_tests().empty())
return make_error<StringError>(
"inconsistent LTO Unit splitting (recompile with -fsplit-lto-unit)",
inconvertibleErrorCode());
}
}
return Error::success();
}
Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) {
// Compute "dead" symbols, we don't want to import/export these!
DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;
DenseMap<GlobalValue::GUID, PrevailingType> GUIDPrevailingResolutions;
for (auto &Res : GlobalResolutions) {
// Normally a resolution has the IR name of the symbol. We can do nothing here
// otherwise. See comments in GlobalResolution struct for more details.
if (Res.second.IRName.empty())
continue;
GlobalValue::GUID GUID = GlobalValue::getGUID(
GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
if (Res.second.VisibleOutsideSummary && Res.second.Prevailing)
GUIDPreservedSymbols.insert(GUID);
GUIDPrevailingResolutions[GUID] =
Res.second.Prevailing ? PrevailingType::Yes : PrevailingType::No;
}
auto isPrevailing = [&](GlobalValue::GUID G) {
auto It = GUIDPrevailingResolutions.find(G);
if (It == GUIDPrevailingResolutions.end())
return PrevailingType::Unknown;
return It->second;
};
computeDeadSymbolsWithConstProp(ThinLTO.CombinedIndex, GUIDPreservedSymbols,
isPrevailing, Conf.OptLevel > 0);
// Setup output file to emit statistics.
auto StatsFileOrErr = setupStatsFile(Conf.StatsFile);
if (!StatsFileOrErr)
return StatsFileOrErr.takeError();
std::unique_ptr<ToolOutputFile> StatsFile = std::move(StatsFileOrErr.get());
// Finalize linking of regular LTO modules containing summaries now that
// we have computed liveness information.
for (auto &M : RegularLTO.ModsWithSummaries)
if (Error Err = linkRegularLTO(std::move(M),
/*LivenessFromIndex=*/true))
return Err;
// Ensure we don't have inconsistently split LTO units with type tests.
if (Error Err = checkPartiallySplit())
return Err;
Error Result = runRegularLTO(AddStream);
if (!Result)
Result = runThinLTO(AddStream, Cache, GUIDPreservedSymbols);
if (StatsFile)
PrintStatisticsJSON(StatsFile->os());
return Result;
}
Error LTO::runRegularLTO(AddStreamFn AddStream) {
// Make sure commons have the right size/alignment: we kept the largest from
// all the prevailing when adding the inputs, and we apply it here.
const DataLayout &DL = RegularLTO.CombinedModule->getDataLayout();
for (auto &I : RegularLTO.Commons) {
if (!I.second.Prevailing)
// Don't do anything if no instance of this common was prevailing.
continue;
GlobalVariable *OldGV = RegularLTO.CombinedModule->getNamedGlobal(I.first);
if (OldGV && DL.getTypeAllocSize(OldGV->getValueType()) == I.second.Size) {
// Don't create a new global if the type is already correct, just make
// sure the alignment is correct.
OldGV->setAlignment(I.second.Align);
continue;
}
ArrayType *Ty =
ArrayType::get(Type::getInt8Ty(RegularLTO.Ctx), I.second.Size);
auto *GV = new GlobalVariable(*RegularLTO.CombinedModule, Ty, false,
GlobalValue::CommonLinkage,
ConstantAggregateZero::get(Ty), "");
GV->setAlignment(I.second.Align);
if (OldGV) {
OldGV->replaceAllUsesWith(ConstantExpr::getBitCast(GV, OldGV->getType()));
GV->takeName(OldGV);
OldGV->eraseFromParent();
} else {
GV->setName(I.first);
}
}
if (Conf.PreOptModuleHook &&
!Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule))
return Error::success();
if (!Conf.CodeGenOnly) {
for (const auto &R : GlobalResolutions) {
if (!R.second.isPrevailingIRSymbol())
continue;
if (R.second.Partition != 0 &&
R.second.Partition != GlobalResolution::External)
continue;
GlobalValue *GV =
RegularLTO.CombinedModule->getNamedValue(R.second.IRName);
// Ignore symbols defined in other partitions.
// Also skip declarations, which are not allowed to have internal linkage.
if (!GV || GV->hasLocalLinkage() || GV->isDeclaration())
continue;
GV->setUnnamedAddr(R.second.UnnamedAddr ? GlobalValue::UnnamedAddr::Global
: GlobalValue::UnnamedAddr::None);
if (EnableLTOInternalization && R.second.Partition == 0)
GV->setLinkage(GlobalValue::InternalLinkage);
}
RegularLTO.CombinedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);
if (Conf.PostInternalizeModuleHook &&
!Conf.PostInternalizeModuleHook(0, *RegularLTO.CombinedModule))
return Error::success();
}
return backend(Conf, AddStream, RegularLTO.ParallelCodeGenParallelismLevel,
std::move(RegularLTO.CombinedModule), ThinLTO.CombinedIndex);
}
static const char *libcallRoutineNames[] = {
#define HANDLE_LIBCALL(code, name) name,
#include "llvm/IR/RuntimeLibcalls.def"
#undef HANDLE_LIBCALL
};
ArrayRef<const char*> LTO::getRuntimeLibcallSymbols() {
return makeArrayRef(libcallRoutineNames);
}
/// This class defines the interface to the ThinLTO backend.
class lto::ThinBackendProc {
protected:
const Config &Conf;
ModuleSummaryIndex &CombinedIndex;
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries;
public:
ThinBackendProc(const Config &Conf, ModuleSummaryIndex &CombinedIndex,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries)
: Conf(Conf), CombinedIndex(CombinedIndex),
ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries) {}
virtual ~ThinBackendProc() {}
virtual Error start(
unsigned Task, BitcodeModule BM,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
virtual Error wait() = 0;
};
namespace {
class InProcessThinBackend : public ThinBackendProc {
ThreadPool BackendThreadPool;
AddStreamFn AddStream;
NativeObjectCache Cache;
std::set<GlobalValue::GUID> CfiFunctionDefs;
std::set<GlobalValue::GUID> CfiFunctionDecls;
Optional<Error> Err;
std::mutex ErrMu;
public:
InProcessThinBackend(
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
unsigned ThinLTOParallelismLevel,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, NativeObjectCache Cache)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
BackendThreadPool(ThinLTOParallelismLevel),
AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
for (auto &Name : CombinedIndex.cfiFunctionDefs())
CfiFunctionDefs.insert(
GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
for (auto &Name : CombinedIndex.cfiFunctionDecls())
CfiFunctionDecls.insert(
GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
}
Error runThinLTOBackendThread(
AddStreamFn AddStream, NativeObjectCache Cache, unsigned Task,
BitcodeModule BM, ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> &ModuleMap) {
auto RunThinBackend = [&](AddStreamFn AddStream) {
LTOLLVMContext BackendContext(Conf);
Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext);
if (!MOrErr)
return MOrErr.takeError();
return thinBackend(Conf, Task, AddStream, **MOrErr, CombinedIndex,
ImportList, DefinedGlobals, ModuleMap);
};
auto ModuleID = BM.getModuleIdentifier();
if (!Cache || !CombinedIndex.modulePaths().count(ModuleID) ||
all_of(CombinedIndex.getModuleHash(ModuleID),
[](uint32_t V) { return V == 0; }))
// Cache disabled or no entry for this module in the combined index or
// no module hash.
return RunThinBackend(AddStream);
SmallString<40> Key;
// The module may be cached; compute the key used to look it up in the cache.
computeLTOCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList,
ExportList, ResolvedODR, DefinedGlobals, CfiFunctionDefs,
CfiFunctionDecls);
if (AddStreamFn CacheAddStream = Cache(Task, Key))
return RunThinBackend(CacheAddStream);
return Error::success();
}
Error start(
unsigned Task, BitcodeModule BM,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
MapVector<StringRef, BitcodeModule> &ModuleMap) override {
StringRef ModulePath = BM.getModuleIdentifier();
assert(ModuleToDefinedGVSummaries.count(ModulePath));
const GVSummaryMapTy &DefinedGlobals =
ModuleToDefinedGVSummaries.find(ModulePath)->second;
BackendThreadPool.async(
[=](BitcodeModule BM, ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>
&ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> &ModuleMap) {
Error E = runThinLTOBackendThread(
AddStream, Cache, Task, BM, CombinedIndex, ImportList, ExportList,
ResolvedODR, DefinedGlobals, ModuleMap);
if (E) {
std::unique_lock<std::mutex> L(ErrMu);
if (Err)
Err = joinErrors(std::move(*Err), std::move(E));
else
Err = std::move(E);
}
},
BM, std::ref(CombinedIndex), std::ref(ImportList), std::ref(ExportList),
std::ref(ResolvedODR), std::ref(DefinedGlobals), std::ref(ModuleMap));
return Error::success();
}
Error wait() override {
BackendThreadPool.wait();
if (Err)
return std::move(*Err);
else
return Error::success();
}
};
} // end anonymous namespace
ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) {
return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, NativeObjectCache Cache) {
return std::make_unique<InProcessThinBackend>(
Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries,
AddStream, Cache);
};
}
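// Usage sketch (illustrative; names outside this file are assumptions):
// a linker typically selects the in-process backend when constructing
// the LTO object, e.g.:
//
//   lto::Config Conf;  // populated by the linker
//   lto::ThinBackend Backend = lto::createInProcessThinBackend(
//       llvm::heavyweight_hardware_concurrency());
//   lto::LTO Link(std::move(Conf), std::move(Backend));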
// Given the original \p Path to an output file, replace any path
// prefix matching \p OldPrefix with \p NewPrefix. Also, create the
// resulting directory if it does not yet exist.
std::string lto::getThinLTOOutputFile(const std::string &Path,
const std::string &OldPrefix,
const std::string &NewPrefix) {
if (OldPrefix.empty() && NewPrefix.empty())
return Path;
SmallString<128> NewPath(Path);
llvm::sys::path::replace_path_prefix(NewPath, OldPrefix, NewPrefix);
StringRef ParentPath = llvm::sys::path::parent_path(NewPath.str());
if (!ParentPath.empty()) {
// Make sure the new directory exists, creating it if necessary.
if (std::error_code EC = llvm::sys::fs::create_directories(ParentPath))
llvm::errs() << "warning: could not create directory '" << ParentPath
<< "': " << EC.message() << '\n';
}
return NewPath.str();
}
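// For example (illustrative): with OldPrefix = "/home/obj" and
// NewPrefix = "/tmp/obj", a Path of "/home/obj/x/y.o" is rewritten to
// "/tmp/obj/x/y.o", and the directory "/tmp/obj/x" is created if it
// does not yet exist.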
namespace {
class WriteIndexesThinBackend : public ThinBackendProc {
std::string OldPrefix, NewPrefix;
bool ShouldEmitImportsFiles;
raw_fd_ostream *LinkedObjectsFile;
lto::IndexWriteCallback OnWrite;
public:
WriteIndexesThinBackend(
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
std::string OldPrefix, std::string NewPrefix, bool ShouldEmitImportsFiles,
raw_fd_ostream *LinkedObjectsFile, lto::IndexWriteCallback OnWrite)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
OldPrefix(OldPrefix), NewPrefix(NewPrefix),
ShouldEmitImportsFiles(ShouldEmitImportsFiles),
LinkedObjectsFile(LinkedObjectsFile), OnWrite(OnWrite) {}
Error start(
unsigned Task, BitcodeModule BM,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
MapVector<StringRef, BitcodeModule> &ModuleMap) override {
StringRef ModulePath = BM.getModuleIdentifier();
std::string NewModulePath =
getThinLTOOutputFile(ModulePath, OldPrefix, NewPrefix);
if (LinkedObjectsFile)
*LinkedObjectsFile << NewModulePath << '\n';
std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
ImportList, ModuleToSummariesForIndex);
std::error_code EC;
raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC,
sys::fs::OpenFlags::OF_None);
if (EC)
return errorCodeToError(EC);
WriteIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex);
if (ShouldEmitImportsFiles) {
EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports",
ModuleToSummariesForIndex);
if (EC)
return errorCodeToError(EC);
}
if (OnWrite)
OnWrite(ModulePath);
return Error::success();
}
Error wait() override { return Error::success(); }
};
} // end anonymous namespace
ThinBackend lto::createWriteIndexesThinBackend(
std::string OldPrefix, std::string NewPrefix, bool ShouldEmitImportsFiles,
raw_fd_ostream *LinkedObjectsFile, IndexWriteCallback OnWrite) {
return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, NativeObjectCache Cache) {
return std::make_unique<WriteIndexesThinBackend>(
Conf, CombinedIndex, ModuleToDefinedGVSummaries, OldPrefix, NewPrefix,
ShouldEmitImportsFiles, LinkedObjectsFile, OnWrite);
};
}
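// Usage sketch (illustrative): a distributed ThinLTO build uses this
// backend to emit a per-module "<name>.thinlto.bc" index (and optional
// "<name>.imports" file) instead of running codegen in-process, e.g.:
//
//   lto::ThinBackend Backend = lto::createWriteIndexesThinBackend(
//       /*OldPrefix=*/"", /*NewPrefix=*/"", /*ShouldEmitImportsFiles=*/true,
//       /*LinkedObjectsFile=*/nullptr, /*OnWrite=*/nullptr);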
Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
if (ThinLTO.ModuleMap.empty())
return Error::success();
if (Conf.CombinedIndexHook &&
!Conf.CombinedIndexHook(ThinLTO.CombinedIndex, GUIDPreservedSymbols))
return Error::success();
// Collect for each module the list of functions it defines (GUID ->
// Summary).
StringMap<GVSummaryMapTy>
ModuleToDefinedGVSummaries(ThinLTO.ModuleMap.size());
ThinLTO.CombinedIndex.collectDefinedGVSummariesPerModule(
ModuleToDefinedGVSummaries);
// Create entries for any modules that didn't have any GV summaries
// (either they didn't have any GVs to start with, or we suppressed
// generation of the summaries because they e.g. had inline assembly
// uses that couldn't be promoted/renamed on export). This is so
// InProcessThinBackend::start can still launch a backend thread, which
// is passed the map of summaries for the module, without any special
// handling for this case.
for (auto &Mod : ThinLTO.ModuleMap)
if (!ModuleToDefinedGVSummaries.count(Mod.first))
ModuleToDefinedGVSummaries.try_emplace(Mod.first);
// Synthesize entry counts for functions in the CombinedIndex.
computeSyntheticCounts(ThinLTO.CombinedIndex);
StringMap<FunctionImporter::ImportMapTy> ImportLists(
ThinLTO.ModuleMap.size());
StringMap<FunctionImporter::ExportSetTy> ExportLists(
ThinLTO.ModuleMap.size());
StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
if (DumpThinCGSCCs)
ThinLTO.CombinedIndex.dumpSCCs(outs());
std::set<GlobalValue::GUID> ExportedGUIDs;
// Perform index-based WPD. This will return immediately if there are
// no index entries in the typeIdMetadata map (e.g. if we are instead
// performing IR-based WPD in hybrid regular/thin LTO mode).
std::map<ValueInfo, std::vector<VTableSlotSummary>> LocalWPDTargetsMap;
runWholeProgramDevirtOnIndex(ThinLTO.CombinedIndex, ExportedGUIDs,
LocalWPDTargetsMap);
if (Conf.OptLevel > 0)
ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
ImportLists, ExportLists);
// Figure out which symbols need to be internalized. This also needs to happen
// at -O0 because summary-based DCE is implemented using internalization, and
// we must apply DCE consistently with the full LTO module in order to avoid
// undefined references during the final link.
for (auto &Res : GlobalResolutions) {
// If the symbol does not have external references or it is not prevailing,
// then there is no need to mark it as exported from a ThinLTO partition.
if (Res.second.Partition != GlobalResolution::External ||
!Res.second.isPrevailingIRSymbol())
continue;
auto GUID = GlobalValue::getGUID(
GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
// Mark exported unless index-based analysis determined it to be dead.
if (ThinLTO.CombinedIndex.isGUIDLive(GUID))
ExportedGUIDs.insert(GUID);
}
// Any functions referenced by the jump table in the regular LTO object must
// be exported.
for (auto &Def : ThinLTO.CombinedIndex.cfiFunctionDefs())
ExportedGUIDs.insert(
GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Def)));
auto isExported = [&](StringRef ModuleIdentifier, ValueInfo VI) {
const auto &ExportList = ExportLists.find(ModuleIdentifier);
return (ExportList != ExportLists.end() && ExportList->second.count(VI)) ||
ExportedGUIDs.count(VI.getGUID());
};
// Update local devirtualized targets that were exported by cross-module
// importing or by other devirtualizations marked in the ExportedGUIDs set.
updateIndexWPDForExports(ThinLTO.CombinedIndex, isExported,
LocalWPDTargetsMap);
auto isPrevailing = [&](GlobalValue::GUID GUID,
const GlobalValueSummary *S) {
return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath();
};
thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported,
isPrevailing);
auto recordNewLinkage = [&](StringRef ModuleIdentifier,
GlobalValue::GUID GUID,
GlobalValue::LinkageTypes NewLinkage) {
ResolvedODR[ModuleIdentifier][GUID] = NewLinkage;
};
thinLTOResolvePrevailingInIndex(ThinLTO.CombinedIndex, isPrevailing,
recordNewLinkage, GUIDPreservedSymbols);
std::unique_ptr<ThinBackendProc> BackendProc =
ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
AddStream, Cache);
// Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for combined
// module and parallel code generation partitions.
unsigned Task = RegularLTO.ParallelCodeGenParallelismLevel;
for (auto &Mod : ThinLTO.ModuleMap) {
if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
ExportLists[Mod.first],
ResolvedODR[Mod.first], ThinLTO.ModuleMap))
return E;
++Task;
}
return BackendProc->wait();
}
Expected<std::unique_ptr<ToolOutputFile>>
lto::setupOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename,
StringRef RemarksPasses, StringRef RemarksFormat,
bool RemarksWithHotness, int Count) {
std::string Filename = RemarksFilename;
// For ThinLTO, file.opt.<format> becomes
// file.opt.<format>.thin.<num>.<format>.
if (!Filename.empty() && Count != -1)
Filename =
(Twine(Filename) + ".thin." + llvm::utostr(Count) + "." + RemarksFormat)
.str();
auto ResultOrErr = llvm::setupOptimizationRemarks(
Context, Filename, RemarksPasses, RemarksFormat, RemarksWithHotness);
if (Error E = ResultOrErr.takeError())
return std::move(E);
if (*ResultOrErr)
(*ResultOrErr)->keep();
return ResultOrErr;
}
Expected<std::unique_ptr<ToolOutputFile>>
lto::setupStatsFile(StringRef StatsFilename) {
// Setup output file to emit statistics.
if (StatsFilename.empty())
return nullptr;
llvm::EnableStatistics(false);
std::error_code EC;
auto StatsFile =
std::make_unique<ToolOutputFile>(StatsFilename, EC, sys::fs::OF_None);
if (EC)
return errorCodeToError(EC);
StatsFile->keep();
return std::move(StatsFile);
}
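// Usage sketch (illustrative; Conf.StatsFile is an assumed field name):
// callers keep the returned ToolOutputFile alive for the duration of the
// link so collected statistics are flushed to the stats file, e.g.:
//
//   Expected<std::unique_ptr<ToolOutputFile>> StatsFileOrErr =
//       lto::setupStatsFile(Conf.StatsFile);
//   if (!StatsFileOrErr)
//     return StatsFileOrErr.takeError();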
Index: head/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp (revision 362609)
@@ -1,503 +1,503 @@
//===-- BPFISelDAGToDAG.cpp - A dag to dag inst selector for BPF ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for BPF,
// converting from a legalized dag to a BPF dag.
//
//===----------------------------------------------------------------------===//
#include "BPF.h"
#include "BPFRegisterInfo.h"
#include "BPFSubtarget.h"
#include "BPFTargetMachine.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsBPF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
#define DEBUG_TYPE "bpf-isel"
// Instruction Selector Implementation
namespace {
class BPFDAGToDAGISel : public SelectionDAGISel {
/// Subtarget - Keep a pointer to the BPFSubtarget around so that we can
/// make the right decision when generating code for different subtargets.
const BPFSubtarget *Subtarget;
public:
explicit BPFDAGToDAGISel(BPFTargetMachine &TM)
: SelectionDAGISel(TM), Subtarget(nullptr) {}
StringRef getPassName() const override {
return "BPF DAG->DAG Pattern Instruction Selection";
}
bool runOnMachineFunction(MachineFunction &MF) override {
// Reset the subtarget each time through.
Subtarget = &MF.getSubtarget<BPFSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
void PreprocessISelDAG() override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode,
std::vector<SDValue> &OutOps) override;
private:
// Include the pieces autogenerated from the target description.
#include "BPFGenDAGISel.inc"
void Select(SDNode *N) override;
// Complex Pattern for address selection.
bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
// Node preprocessing cases
void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator &I);
void PreprocessCopyToReg(SDNode *Node);
void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator &I);
// Find constants from a constant structure
typedef std::vector<unsigned char> val_vec_type;
bool fillGenericConstant(const DataLayout &DL, const Constant *CV,
val_vec_type &Vals, uint64_t Offset);
bool fillConstantDataArray(const DataLayout &DL, const ConstantDataArray *CDA,
val_vec_type &Vals, int Offset);
bool fillConstantArray(const DataLayout &DL, const ConstantArray *CA,
val_vec_type &Vals, int Offset);
bool fillConstantStruct(const DataLayout &DL, const ConstantStruct *CS,
val_vec_type &Vals, int Offset);
bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset,
uint64_t Size, unsigned char *ByteSeq);
// Mapping from ConstantStruct global value to corresponding byte-list values
std::map<const void *, val_vec_type> cs_vals_;
};
} // namespace
// ComplexPattern used on BPF Load/Store instructions
bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
// if Address is FI, get the TargetFrameIndex.
SDLoc DL(Addr);
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
return true;
}
if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
Addr.getOpcode() == ISD::TargetGlobalAddress)
return false;
// Addresses of the form Addr+const or Addr|const
if (CurDAG->isBaseWithConstantOffset(Addr)) {
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
if (isInt<16>(CN->getSExtValue())) {
// If the first operand is a FI, get the TargetFI Node
if (FrameIndexSDNode *FIN =
dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
else
Base = Addr.getOperand(0);
Offset = CurDAG->getTargetConstant(CN->getSExtValue(), DL, MVT::i64);
return true;
}
}
Base = Addr;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
return true;
}
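// For example (illustrative): an address of the form (add %reg, 8) is
// selected as Base = %reg, Offset = 8 when the constant fits in a signed
// 16-bit immediate; otherwise the whole address becomes the base with a
// zero offset.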
// ComplexPattern used on BPF FI instruction
bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base,
SDValue &Offset) {
SDLoc DL(Addr);
if (!CurDAG->isBaseWithConstantOffset(Addr))
return false;
// Addresses of the form Addr+const or Addr|const
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
if (isInt<16>(CN->getSExtValue())) {
// If the first operand is a FI, get the TargetFI Node
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
else
return false;
Offset = CurDAG->getTargetConstant(CN->getSExtValue(), DL, MVT::i64);
return true;
}
return false;
}
bool BPFDAGToDAGISel::SelectInlineAsmMemoryOperand(
const SDValue &Op, unsigned ConstraintCode, std::vector<SDValue> &OutOps) {
SDValue Op0, Op1;
switch (ConstraintCode) {
default:
return true;
case InlineAsm::Constraint_m: // memory
if (!SelectAddr(Op, Op0, Op1))
return true;
break;
}
SDLoc DL(Op);
  SDValue AluOp = CurDAG->getTargetConstant(ISD::ADD, DL, MVT::i32);
OutOps.push_back(Op0);
OutOps.push_back(Op1);
OutOps.push_back(AluOp);
return false;
}
void BPFDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
return;
}
// tablegen selection should be handled here.
switch (Opcode) {
default:
break;
case ISD::SDIV: {
DebugLoc Empty;
const DebugLoc &DL = Node->getDebugLoc();
if (DL != Empty)
errs() << "Error at line " << DL.getLine() << ": ";
else
errs() << "Error: ";
errs() << "Unsupport signed division for DAG: ";
Node->print(errs(), CurDAG);
errs() << "Please convert to unsigned div/mod.\n";
break;
}
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
switch (IntNo) {
case Intrinsic::bpf_load_byte:
case Intrinsic::bpf_load_half:
case Intrinsic::bpf_load_word: {
SDLoc DL(Node);
SDValue Chain = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
SDValue Skb = Node->getOperand(2);
SDValue N3 = Node->getOperand(3);
SDValue R6Reg = CurDAG->getRegister(BPF::R6, MVT::i64);
Chain = CurDAG->getCopyToReg(Chain, DL, R6Reg, Skb, SDValue());
Node = CurDAG->UpdateNodeOperands(Node, Chain, N1, R6Reg, N3);
break;
}
}
break;
}
case ISD::FrameIndex: {
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
EVT VT = Node->getValueType(0);
SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
unsigned Opc = BPF::MOV_rr;
if (Node->hasOneUse()) {
CurDAG->SelectNodeTo(Node, Opc, VT, TFI);
return;
}
ReplaceNode(Node, CurDAG->getMachineNode(Opc, SDLoc(Node), VT, TFI));
return;
}
}
// Select the default instruction
SelectCode(Node);
}
void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
SelectionDAG::allnodes_iterator &I) {
union {
uint8_t c[8];
uint16_t s;
uint32_t i;
uint64_t d;
} new_val; // holds the constant value that replaces the load.
bool to_replace = false;
SDLoc DL(Node);
const LoadSDNode *LD = cast<LoadSDNode>(Node);
uint64_t size = LD->getMemOperand()->getSize();
if (!size || size > 8 || (size & (size - 1)))
return;
SDNode *LDAddrNode = LD->getOperand(1).getNode();
// Match LDAddr against either global_addr or (global_addr + offset)
unsigned opcode = LDAddrNode->getOpcode();
if (opcode == ISD::ADD) {
SDValue OP1 = LDAddrNode->getOperand(0);
SDValue OP2 = LDAddrNode->getOperand(1);
// We want to find the pattern global_addr + offset
SDNode *OP1N = OP1.getNode();
if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END || OP1N->getNumOperands() == 0)
return;
LLVM_DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
const GlobalAddressSDNode *GADN =
dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode());
const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode());
if (GADN && CDN)
to_replace =
getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c);
} else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END &&
LDAddrNode->getNumOperands() > 0) {
LLVM_DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
SDValue OP1 = LDAddrNode->getOperand(0);
if (const GlobalAddressSDNode *GADN =
dyn_cast<GlobalAddressSDNode>(OP1.getNode()))
to_replace = getConstantFieldValue(GADN, 0, size, new_val.c);
}
if (!to_replace)
return;
// Select the constant value that replaces the old load.
uint64_t val;
if (size == 1)
val = new_val.c[0];
else if (size == 2)
val = new_val.s;
else if (size == 4)
val = new_val.i;
else {
val = new_val.d;
}
LLVM_DEBUG(dbgs() << "Replacing load of size " << size << " with constant "
<< val << '\n');
- SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64);
+ SDValue NVal = CurDAG->getConstant(val, DL, LD->getValueType(0));
// After replacement, the current node is dead; step the iterator
// back one so that it remains valid.
I--;
SDValue From[] = {SDValue(Node, 0), SDValue(Node, 1)};
SDValue To[] = {NVal, NVal};
CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2);
I++;
// It is safe to delete node now
CurDAG->DeleteNode(Node);
}
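// For example (illustrative): given
//   static const struct { int a; int b; } g = { 2, 3 };
// a load of "g.b" can be folded into the constant 3 here, avoiding a
// runtime read from the read-only data section.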
void BPFDAGToDAGISel::PreprocessISelDAG() {
// Iterate through all nodes, interested in the following cases:
//
// . Loads from a ConstantStruct or ConstantArray that can be turned
// into constants themselves; with this we can avoid reading from
// the read-only section at runtime.
//
// . Redundant ANDs after intrinsic narrow loads, which can be removed.
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end();
I != E;) {
SDNode *Node = &*I++;
unsigned Opcode = Node->getOpcode();
if (Opcode == ISD::LOAD)
PreprocessLoad(Node, I);
else if (Opcode == ISD::AND)
PreprocessTrunc(Node, I);
}
}
bool BPFDAGToDAGISel::getConstantFieldValue(const GlobalAddressSDNode *Node,
uint64_t Offset, uint64_t Size,
unsigned char *ByteSeq) {
const GlobalVariable *V = dyn_cast<GlobalVariable>(Node->getGlobal());
if (!V || !V->hasInitializer())
return false;
const Constant *Init = V->getInitializer();
const DataLayout &DL = CurDAG->getDataLayout();
val_vec_type TmpVal;
auto it = cs_vals_.find(static_cast<const void *>(Init));
if (it != cs_vals_.end()) {
TmpVal = it->second;
} else {
uint64_t total_size = 0;
if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(Init))
total_size =
DL.getStructLayout(cast<StructType>(CS->getType()))->getSizeInBytes();
else if (const ConstantArray *CA = dyn_cast<ConstantArray>(Init))
total_size = DL.getTypeAllocSize(CA->getType()->getElementType()) *
CA->getNumOperands();
else
return false;
val_vec_type Vals(total_size, 0);
if (fillGenericConstant(DL, Init, Vals, 0) == false)
return false;
cs_vals_[static_cast<const void *>(Init)] = Vals;
TmpVal = std::move(Vals);
}
// test whether host endianness matches target
union {
uint8_t c[2];
uint16_t s;
} test_buf;
uint16_t test_val = 0x2345;
if (DL.isLittleEndian())
support::endian::write16le(test_buf.c, test_val);
else
support::endian::write16be(test_buf.c, test_val);
bool endian_match = test_buf.s == test_val;
for (uint64_t i = Offset, j = 0; i < Offset + Size; i++, j++)
ByteSeq[j] = endian_match ? TmpVal[i] : TmpVal[Offset + Size - 1 - j];
return true;
}
bool BPFDAGToDAGISel::fillGenericConstant(const DataLayout &DL,
const Constant *CV,
val_vec_type &Vals, uint64_t Offset) {
uint64_t Size = DL.getTypeAllocSize(CV->getType());
if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
return true; // already done
if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
uint64_t val = CI->getZExtValue();
LLVM_DEBUG(dbgs() << "Byte array at offset " << Offset << " with value "
<< val << '\n');
if (Size > 8 || (Size & (Size - 1)))
return false;
// Store based on target endian
for (uint64_t i = 0; i < Size; ++i) {
Vals[Offset + i] = DL.isLittleEndian()
? ((val >> (i * 8)) & 0xFF)
: ((val >> ((Size - i - 1) * 8)) & 0xFF);
}
return true;
}
if (const ConstantDataArray *CDA = dyn_cast<ConstantDataArray>(CV))
return fillConstantDataArray(DL, CDA, Vals, Offset);
if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV))
return fillConstantArray(DL, CA, Vals, Offset);
if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
return fillConstantStruct(DL, CVS, Vals, Offset);
return false;
}
bool BPFDAGToDAGISel::fillConstantDataArray(const DataLayout &DL,
const ConstantDataArray *CDA,
val_vec_type &Vals, int Offset) {
for (unsigned i = 0, e = CDA->getNumElements(); i != e; ++i) {
if (fillGenericConstant(DL, CDA->getElementAsConstant(i), Vals, Offset) ==
false)
return false;
Offset += DL.getTypeAllocSize(CDA->getElementAsConstant(i)->getType());
}
return true;
}
bool BPFDAGToDAGISel::fillConstantArray(const DataLayout &DL,
const ConstantArray *CA,
val_vec_type &Vals, int Offset) {
for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) {
if (fillGenericConstant(DL, CA->getOperand(i), Vals, Offset) == false)
return false;
Offset += DL.getTypeAllocSize(CA->getOperand(i)->getType());
}
return true;
}
bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL,
const ConstantStruct *CS,
val_vec_type &Vals, int Offset) {
const StructLayout *Layout = DL.getStructLayout(CS->getType());
for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) {
const Constant *Field = CS->getOperand(i);
uint64_t SizeSoFar = Layout->getElementOffset(i);
if (fillGenericConstant(DL, Field, Vals, Offset + SizeSoFar) == false)
return false;
}
return true;
}
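// For example (illustrative): for "struct { char c; int i; }" with
// initializer { 1, 2 } on a little-endian target, the byte vector gets
// 0x01 at the struct offset of "c" and 0x02 0x00 0x00 0x00 at the layout
// offset of "i" (typically 4), with padding bytes left as zero.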
void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
SelectionDAG::allnodes_iterator &I) {
ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1));
if (!MaskN)
return;
// The Reg operand should be a virtual register, which is defined
// outside the current basic block. The DAG combiner does a good job
// of removing truncations within a single basic block, except when
// the Reg operand comes from bpf_load_[byte | half | word], whose
// zero-extended results the generic optimizer does not understand.
SDValue BaseV = Node->getOperand(0);
if (BaseV.getOpcode() != ISD::INTRINSIC_W_CHAIN)
return;
unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
uint64_t MaskV = MaskN->getZExtValue();
if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
(IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) ||
(IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF)))
return;
LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: ";
Node->dump(); dbgs() << '\n');
I--;
CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
I++;
CurDAG->DeleteNode(Node);
return;
}
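// For example (illustrative): for "r = bpf_load_byte(...) & 0xFF", the
// AND is redundant because bpf_load_byte already zero-extends its
// result, so all uses of the AND are rewritten to the intrinsic result.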
FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
return new BPFDAGToDAGISel(TM);
}
Index: head/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp (revision 362609)
@@ -1,1240 +1,1272 @@
//===- BTFDebug.cpp - BTF Generator ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains support for writing BTF debug info.
//
//===----------------------------------------------------------------------===//
#include "BTFDebug.h"
#include "BPF.h"
#include "BPFCORE.h"
#include "MCTargetDesc/BPFMCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/LineIterator.h"
using namespace llvm;
static const char *BTFKindStr[] = {
#define HANDLE_BTF_KIND(ID, NAME) "BTF_KIND_" #NAME,
#include "BTF.def"
};
/// Emit a BTF common type.
void BTFTypeBase::emitType(MCStreamer &OS) {
OS.AddComment(std::string(BTFKindStr[Kind]) + "(id = " + std::to_string(Id) +
")");
OS.EmitIntValue(BTFType.NameOff, 4);
OS.AddComment("0x" + Twine::utohexstr(BTFType.Info));
OS.EmitIntValue(BTFType.Info, 4);
OS.EmitIntValue(BTFType.Size, 4);
}
BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag,
bool NeedsFixup)
: DTy(DTy), NeedsFixup(NeedsFixup) {
switch (Tag) {
case dwarf::DW_TAG_pointer_type:
Kind = BTF::BTF_KIND_PTR;
break;
case dwarf::DW_TAG_const_type:
Kind = BTF::BTF_KIND_CONST;
break;
case dwarf::DW_TAG_volatile_type:
Kind = BTF::BTF_KIND_VOLATILE;
break;
case dwarf::DW_TAG_typedef:
Kind = BTF::BTF_KIND_TYPEDEF;
break;
case dwarf::DW_TAG_restrict_type:
Kind = BTF::BTF_KIND_RESTRICT;
break;
default:
llvm_unreachable("Unknown DIDerivedType Tag");
}
BTFType.Info = Kind << 24;
}
void BTFTypeDerived::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
BTFType.NameOff = BDebug.addString(DTy->getName());
if (NeedsFixup)
return;
// The base type for PTR/CONST/VOLATILE could be void.
const DIType *ResolvedType = DTy->getBaseType();
if (!ResolvedType) {
assert((Kind == BTF::BTF_KIND_PTR || Kind == BTF::BTF_KIND_CONST ||
Kind == BTF::BTF_KIND_VOLATILE) &&
"Invalid null basetype");
BTFType.Type = 0;
} else {
BTFType.Type = BDebug.getTypeId(ResolvedType);
}
}
void BTFTypeDerived::emitType(MCStreamer &OS) { BTFTypeBase::emitType(OS); }
void BTFTypeDerived::setPointeeType(uint32_t PointeeType) {
BTFType.Type = PointeeType;
}
/// Represent a struct/union forward declaration.
BTFTypeFwd::BTFTypeFwd(StringRef Name, bool IsUnion) : Name(Name) {
Kind = BTF::BTF_KIND_FWD;
BTFType.Info = IsUnion << 31 | Kind << 24;
BTFType.Type = 0;
}
void BTFTypeFwd::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
BTFType.NameOff = BDebug.addString(Name);
}
void BTFTypeFwd::emitType(MCStreamer &OS) { BTFTypeBase::emitType(OS); }
BTFTypeInt::BTFTypeInt(uint32_t Encoding, uint32_t SizeInBits,
uint32_t OffsetInBits, StringRef TypeName)
: Name(TypeName) {
// Translate IR int encoding to BTF int encoding.
uint8_t BTFEncoding;
switch (Encoding) {
case dwarf::DW_ATE_boolean:
BTFEncoding = BTF::INT_BOOL;
break;
case dwarf::DW_ATE_signed:
case dwarf::DW_ATE_signed_char:
BTFEncoding = BTF::INT_SIGNED;
break;
case dwarf::DW_ATE_unsigned:
case dwarf::DW_ATE_unsigned_char:
BTFEncoding = 0;
break;
default:
llvm_unreachable("Unknown BTFTypeInt Encoding");
}
Kind = BTF::BTF_KIND_INT;
BTFType.Info = Kind << 24;
BTFType.Size = roundupToBytes(SizeInBits);
IntVal = (BTFEncoding << 24) | OffsetInBits << 16 | SizeInBits;
}
void BTFTypeInt::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
BTFType.NameOff = BDebug.addString(Name);
}
void BTFTypeInt::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
OS.AddComment("0x" + Twine::utohexstr(IntVal));
OS.EmitIntValue(IntVal, 4);
}
BTFTypeEnum::BTFTypeEnum(const DICompositeType *ETy, uint32_t VLen) : ETy(ETy) {
Kind = BTF::BTF_KIND_ENUM;
BTFType.Info = Kind << 24 | VLen;
BTFType.Size = roundupToBytes(ETy->getSizeInBits());
}
void BTFTypeEnum::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
BTFType.NameOff = BDebug.addString(ETy->getName());
DINodeArray Elements = ETy->getElements();
for (const auto Element : Elements) {
const auto *Enum = cast<DIEnumerator>(Element);
struct BTF::BTFEnum BTFEnum;
BTFEnum.NameOff = BDebug.addString(Enum->getName());
// BTF enum values are 32-bit; enforce it.
BTFEnum.Val = static_cast<uint32_t>(Enum->getValue());
EnumValues.push_back(BTFEnum);
}
}
void BTFTypeEnum::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
for (const auto &Enum : EnumValues) {
OS.EmitIntValue(Enum.NameOff, 4);
OS.EmitIntValue(Enum.Val, 4);
}
}
BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t NumElems) {
Kind = BTF::BTF_KIND_ARRAY;
BTFType.NameOff = 0;
BTFType.Info = Kind << 24;
BTFType.Size = 0;
ArrayInfo.ElemType = ElemTypeId;
ArrayInfo.Nelems = NumElems;
}
/// Represent a BTF array.
void BTFTypeArray::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
// The IR does not really have a type for the index.
// A special type for array index should have been
// created during initial type traversal. Just
// retrieve that type id.
ArrayInfo.IndexType = BDebug.getArrayIndexTypeId();
}
void BTFTypeArray::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
OS.EmitIntValue(ArrayInfo.ElemType, 4);
OS.EmitIntValue(ArrayInfo.IndexType, 4);
OS.EmitIntValue(ArrayInfo.Nelems, 4);
}
/// Represent either a struct or a union.
BTFTypeStruct::BTFTypeStruct(const DICompositeType *STy, bool IsStruct,
bool HasBitField, uint32_t Vlen)
: STy(STy), HasBitField(HasBitField) {
Kind = IsStruct ? BTF::BTF_KIND_STRUCT : BTF::BTF_KIND_UNION;
BTFType.Size = roundupToBytes(STy->getSizeInBits());
BTFType.Info = (HasBitField << 31) | (Kind << 24) | Vlen;
}
void BTFTypeStruct::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
BTFType.NameOff = BDebug.addString(STy->getName());
// Add struct/union members.
const DINodeArray Elements = STy->getElements();
for (const auto *Element : Elements) {
struct BTF::BTFMember BTFMember;
const auto *DDTy = cast<DIDerivedType>(Element);
BTFMember.NameOff = BDebug.addString(DDTy->getName());
if (HasBitField) {
uint8_t BitFieldSize = DDTy->isBitField() ? DDTy->getSizeInBits() : 0;
BTFMember.Offset = BitFieldSize << 24 | DDTy->getOffsetInBits();
} else {
BTFMember.Offset = DDTy->getOffsetInBits();
}
const auto *BaseTy = DDTy->getBaseType();
BTFMember.Type = BDebug.getTypeId(BaseTy);
Members.push_back(BTFMember);
}
}
void BTFTypeStruct::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
for (const auto &Member : Members) {
OS.EmitIntValue(Member.NameOff, 4);
OS.EmitIntValue(Member.Type, 4);
OS.AddComment("0x" + Twine::utohexstr(Member.Offset));
OS.EmitIntValue(Member.Offset, 4);
}
}
std::string BTFTypeStruct::getName() { return STy->getName(); }
/// The Func kind represents both subprogram and pointee of function
/// pointers. If the FuncName is empty, it represents a pointee of function
/// pointer. Otherwise, it represents a subprogram. The func arg names
/// are empty for pointee of function pointer case, and are valid names
/// for subprogram.
BTFTypeFuncProto::BTFTypeFuncProto(
const DISubroutineType *STy, uint32_t VLen,
const std::unordered_map<uint32_t, StringRef> &FuncArgNames)
: STy(STy), FuncArgNames(FuncArgNames) {
Kind = BTF::BTF_KIND_FUNC_PROTO;
BTFType.Info = (Kind << 24) | VLen;
}
void BTFTypeFuncProto::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
DITypeRefArray Elements = STy->getTypeArray();
auto RetType = Elements[0];
BTFType.Type = RetType ? BDebug.getTypeId(RetType) : 0;
BTFType.NameOff = 0;
// A null parameter, typically the last one, represents varargs;
// encode its NameOff/Type as 0.
for (unsigned I = 1, N = Elements.size(); I < N; ++I) {
struct BTF::BTFParam Param;
auto Element = Elements[I];
if (Element) {
Param.NameOff = BDebug.addString(FuncArgNames[I]);
Param.Type = BDebug.getTypeId(Element);
} else {
Param.NameOff = 0;
Param.Type = 0;
}
Parameters.push_back(Param);
}
}
void BTFTypeFuncProto::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
for (const auto &Param : Parameters) {
OS.EmitIntValue(Param.NameOff, 4);
OS.EmitIntValue(Param.Type, 4);
}
}
BTFTypeFunc::BTFTypeFunc(StringRef FuncName, uint32_t ProtoTypeId,
uint32_t Scope)
: Name(FuncName) {
Kind = BTF::BTF_KIND_FUNC;
BTFType.Info = (Kind << 24) | Scope;
BTFType.Type = ProtoTypeId;
}
void BTFTypeFunc::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
BTFType.NameOff = BDebug.addString(Name);
}
void BTFTypeFunc::emitType(MCStreamer &OS) { BTFTypeBase::emitType(OS); }
BTFKindVar::BTFKindVar(StringRef VarName, uint32_t TypeId, uint32_t VarInfo)
: Name(VarName) {
Kind = BTF::BTF_KIND_VAR;
BTFType.Info = Kind << 24;
BTFType.Type = TypeId;
Info = VarInfo;
}
void BTFKindVar::completeType(BTFDebug &BDebug) {
BTFType.NameOff = BDebug.addString(Name);
}
void BTFKindVar::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
OS.EmitIntValue(Info, 4);
}
BTFKindDataSec::BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName)
: Asm(AsmPrt), Name(SecName) {
Kind = BTF::BTF_KIND_DATASEC;
BTFType.Info = Kind << 24;
BTFType.Size = 0;
}
void BTFKindDataSec::completeType(BTFDebug &BDebug) {
BTFType.NameOff = BDebug.addString(Name);
BTFType.Info |= Vars.size();
}
void BTFKindDataSec::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
for (const auto &V : Vars) {
OS.EmitIntValue(std::get<0>(V), 4);
Asm->EmitLabelReference(std::get<1>(V), 4);
OS.EmitIntValue(std::get<2>(V), 4);
}
}
uint32_t BTFStringTable::addString(StringRef S) {
// Check whether the string already exists.
for (auto &OffsetM : OffsetToIdMap) {
if (Table[OffsetM.second] == S)
return OffsetM.first;
}
// Not found; add it to the string table.
uint32_t Offset = Size;
OffsetToIdMap[Offset] = Table.size();
Table.push_back(S);
Size += S.size() + 1;
return Offset;
}
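// For example (illustrative): starting from a table holding only "" at
// offset 0, addString("foo") returns offset 1 and a subsequent
// addString("bar") returns offset 5, since every string is followed by
// a NUL terminator.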
BTFDebug::BTFDebug(AsmPrinter *AP)
: DebugHandlerBase(AP), OS(*Asm->OutStreamer), SkipInstruction(false),
LineInfoGenerated(false), SecNameOff(0), ArrayIndexTypeId(0),
MapDefNotCollected(true) {
addString("\0");
}
uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry,
const DIType *Ty) {
TypeEntry->setId(TypeEntries.size() + 1);
uint32_t Id = TypeEntry->getId();
DIToIdMap[Ty] = Id;
TypeEntries.push_back(std::move(TypeEntry));
return Id;
}
uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry) {
TypeEntry->setId(TypeEntries.size() + 1);
uint32_t Id = TypeEntry->getId();
TypeEntries.push_back(std::move(TypeEntry));
return Id;
}
void BTFDebug::visitBasicType(const DIBasicType *BTy, uint32_t &TypeId) {
// Only int types are supported in BTF.
uint32_t Encoding = BTy->getEncoding();
if (Encoding != dwarf::DW_ATE_boolean && Encoding != dwarf::DW_ATE_signed &&
Encoding != dwarf::DW_ATE_signed_char &&
Encoding != dwarf::DW_ATE_unsigned &&
Encoding != dwarf::DW_ATE_unsigned_char)
return;
// Create a BTF type instance for this DIBasicType and put it into
// DIToIdMap for cross-type reference check.
auto TypeEntry = std::make_unique<BTFTypeInt>(
Encoding, BTy->getSizeInBits(), BTy->getOffsetInBits(), BTy->getName());
TypeId = addType(std::move(TypeEntry), BTy);
}
/// Handle subprogram or subroutine types.
void BTFDebug::visitSubroutineType(
const DISubroutineType *STy, bool ForSubprog,
const std::unordered_map<uint32_t, StringRef> &FuncArgNames,
uint32_t &TypeId) {
DITypeRefArray Elements = STy->getTypeArray();
uint32_t VLen = Elements.size() - 1;
if (VLen > BTF::MAX_VLEN)
return;
// Subprogram has a valid non-zero-length name, and the pointee of
// a function pointer has an empty name. The subprogram type will
// not be added to DIToIdMap as it should not be referenced by
// any other types.
auto TypeEntry = std::make_unique<BTFTypeFuncProto>(STy, VLen, FuncArgNames);
if (ForSubprog)
TypeId = addType(std::move(TypeEntry)); // For subprogram
else
TypeId = addType(std::move(TypeEntry), STy); // For func ptr
// Visit return type and func arg types.
for (const auto Element : Elements) {
visitTypeEntry(Element);
}
}
/// Handle structure/union types.
void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct,
uint32_t &TypeId) {
const DINodeArray Elements = CTy->getElements();
uint32_t VLen = Elements.size();
if (VLen > BTF::MAX_VLEN)
return;
// Check whether we have any bitfield members or not
bool HasBitField = false;
for (const auto *Element : Elements) {
auto E = cast<DIDerivedType>(Element);
if (E->isBitField()) {
HasBitField = true;
break;
}
}
auto TypeEntry =
std::make_unique<BTFTypeStruct>(CTy, IsStruct, HasBitField, VLen);
StructTypes.push_back(TypeEntry.get());
TypeId = addType(std::move(TypeEntry), CTy);
// Visit all struct members.
for (const auto *Element : Elements)
visitTypeEntry(cast<DIDerivedType>(Element));
}
void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) {
// Visit array element type.
uint32_t ElemTypeId;
const DIType *ElemType = CTy->getBaseType();
visitTypeEntry(ElemType, ElemTypeId, false, false);
// Visit array dimensions.
DINodeArray Elements = CTy->getElements();
for (int I = Elements.size() - 1; I >= 0; --I) {
if (auto *Element = dyn_cast_or_null<DINode>(Elements[I]))
if (Element->getTag() == dwarf::DW_TAG_subrange_type) {
const DISubrange *SR = cast<DISubrange>(Element);
auto *CI = SR->getCount().dyn_cast<ConstantInt *>();
int64_t Count = CI->getSExtValue();
// For struct s { int b; char c[]; }, the c[] will be represented
// as an array with Count = -1.
auto TypeEntry =
std::make_unique<BTFTypeArray>(ElemTypeId,
Count >= 0 ? Count : 0);
if (I == 0)
ElemTypeId = addType(std::move(TypeEntry), CTy);
else
ElemTypeId = addType(std::move(TypeEntry));
}
}
// The array TypeId is the type id of the outermost dimension.
TypeId = ElemTypeId;
// The IR does not have a type for the array index, but BTF requires one.
// So create an array index type if there is none.
if (!ArrayIndexTypeId) {
auto TypeEntry = std::make_unique<BTFTypeInt>(dwarf::DW_ATE_unsigned, 32,
0, "__ARRAY_SIZE_TYPE__");
ArrayIndexTypeId = addType(std::move(TypeEntry));
}
}
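// For example (illustrative): "int a[2][3]" yields two chained
// BTF_KIND_ARRAY entries: an inner one with Nelems = 3 over int, and an
// outer one with Nelems = 2 over the inner array. Both use the shared
// "__ARRAY_SIZE_TYPE__" int as the index type.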
void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) {
DINodeArray Elements = CTy->getElements();
uint32_t VLen = Elements.size();
if (VLen > BTF::MAX_VLEN)
return;
auto TypeEntry = std::make_unique<BTFTypeEnum>(CTy, VLen);
TypeId = addType(std::move(TypeEntry), CTy);
// No need to visit base type as BTF does not encode it.
}
/// Handle structure/union forward declarations.
void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion,
uint32_t &TypeId) {
auto TypeEntry = std::make_unique<BTFTypeFwd>(CTy->getName(), IsUnion);
TypeId = addType(std::move(TypeEntry), CTy);
}
/// Handle structure, union, array and enumeration types.
void BTFDebug::visitCompositeType(const DICompositeType *CTy,
uint32_t &TypeId) {
auto Tag = CTy->getTag();
if (Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) {
// Handle forward declaration differently as it does not have members.
if (CTy->isForwardDecl())
visitFwdDeclType(CTy, Tag == dwarf::DW_TAG_union_type, TypeId);
else
visitStructType(CTy, Tag == dwarf::DW_TAG_structure_type, TypeId);
} else if (Tag == dwarf::DW_TAG_array_type)
visitArrayType(CTy, TypeId);
else if (Tag == dwarf::DW_TAG_enumeration_type)
visitEnumType(CTy, TypeId);
}
/// Handle pointer, typedef, const, volatile, restrict and member types.
void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
bool CheckPointer, bool SeenPointer) {
unsigned Tag = DTy->getTag();
  /// Try to avoid chasing pointees, esp. structure pointees which may
  /// unnecessarily bring in a lot of types.
if (CheckPointer && !SeenPointer) {
SeenPointer = Tag == dwarf::DW_TAG_pointer_type;
}
if (CheckPointer && SeenPointer) {
const DIType *Base = DTy->getBaseType();
if (Base) {
if (const auto *CTy = dyn_cast<DICompositeType>(Base)) {
auto CTag = CTy->getTag();
if ((CTag == dwarf::DW_TAG_structure_type ||
CTag == dwarf::DW_TAG_union_type) &&
!CTy->isForwardDecl()) {
/// Find a candidate, generate a fixup. Later on the struct/union
/// pointee type will be replaced with either a real type or
/// a forward declaration.
auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, true);
auto &Fixup = FixupDerivedTypes[CTy->getName()];
Fixup.first = CTag == dwarf::DW_TAG_union_type;
Fixup.second.push_back(TypeEntry.get());
TypeId = addType(std::move(TypeEntry), DTy);
return;
}
}
}
}
if (Tag == dwarf::DW_TAG_pointer_type || Tag == dwarf::DW_TAG_typedef ||
Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type ||
Tag == dwarf::DW_TAG_restrict_type) {
auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, false);
TypeId = addType(std::move(TypeEntry), DTy);
} else if (Tag != dwarf::DW_TAG_member) {
return;
}
// Visit base type of pointer, typedef, const, volatile, restrict or
// struct/union member.
uint32_t TempTypeId = 0;
if (Tag == dwarf::DW_TAG_member)
visitTypeEntry(DTy->getBaseType(), TempTypeId, true, false);
else
visitTypeEntry(DTy->getBaseType(), TempTypeId, CheckPointer, SeenPointer);
}
void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId,
bool CheckPointer, bool SeenPointer) {
if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) {
TypeId = DIToIdMap[Ty];
+
+ // To handle the case like the following:
+ // struct t;
+ // typedef struct t _t;
+ // struct s1 { _t *c; };
+ // int test1(struct s1 *arg) { ... }
+ //
+ // struct t { int a; int b; };
+ // struct s2 { _t c; }
+ // int test2(struct s2 *arg) { ... }
+ //
+ // During traversing test1() argument, "_t" is recorded
+ // in DIToIdMap and a forward declaration fixup is created
+ // for "struct t" to avoid pointee type traversal.
+ //
+ // During traversing test2() argument, even if we see "_t" is
+ // already defined, we should keep moving to eventually
+ // bring in types for "struct t". Otherwise, the "struct s2"
+ // definition won't be correct.
+ if (Ty && (!CheckPointer || !SeenPointer)) {
+ if (const auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+ unsigned Tag = DTy->getTag();
+ if (Tag == dwarf::DW_TAG_typedef || Tag == dwarf::DW_TAG_const_type ||
+ Tag == dwarf::DW_TAG_volatile_type ||
+ Tag == dwarf::DW_TAG_restrict_type) {
+ uint32_t TmpTypeId;
+ visitTypeEntry(DTy->getBaseType(), TmpTypeId, CheckPointer,
+ SeenPointer);
+ }
+ }
+ }
+
return;
}
if (const auto *BTy = dyn_cast<DIBasicType>(Ty))
visitBasicType(BTy, TypeId);
else if (const auto *STy = dyn_cast<DISubroutineType>(Ty))
visitSubroutineType(STy, false, std::unordered_map<uint32_t, StringRef>(),
TypeId);
else if (const auto *CTy = dyn_cast<DICompositeType>(Ty))
visitCompositeType(CTy, TypeId);
else if (const auto *DTy = dyn_cast<DIDerivedType>(Ty))
visitDerivedType(DTy, TypeId, CheckPointer, SeenPointer);
else
llvm_unreachable("Unknown DIType");
}
void BTFDebug::visitTypeEntry(const DIType *Ty) {
uint32_t TypeId;
visitTypeEntry(Ty, TypeId, false, false);
}
void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) {
TypeId = DIToIdMap[Ty];
return;
}
// MapDef type is a struct type
const auto *CTy = dyn_cast<DICompositeType>(Ty);
if (!CTy)
return;
auto Tag = CTy->getTag();
if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl())
return;
// Record this type
const DINodeArray Elements = CTy->getElements();
bool HasBitField = false;
for (const auto *Element : Elements) {
auto E = cast<DIDerivedType>(Element);
if (E->isBitField()) {
HasBitField = true;
break;
}
}
auto TypeEntry =
std::make_unique<BTFTypeStruct>(CTy, true, HasBitField, Elements.size());
StructTypes.push_back(TypeEntry.get());
TypeId = addType(std::move(TypeEntry), CTy);
// Visit all struct members
for (const auto *Element : Elements) {
const auto *MemberType = cast<DIDerivedType>(Element);
visitTypeEntry(MemberType->getBaseType());
}
}
/// Read file contents from the actual file or from the source
std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
auto File = SP->getFile();
std::string FileName;
if (!File->getFilename().startswith("/") && File->getDirectory().size())
FileName = File->getDirectory().str() + "/" + File->getFilename().str();
else
FileName = File->getFilename();
// No need to populate the contents if they have already been populated.
if (FileContent.find(FileName) != FileContent.end())
return FileName;
std::vector<std::string> Content;
std::string Line;
Content.push_back(Line); // Line 0 for empty string
std::unique_ptr<MemoryBuffer> Buf;
auto Source = File->getSource();
if (Source)
Buf = MemoryBuffer::getMemBufferCopy(*Source);
else if (ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
MemoryBuffer::getFile(FileName))
Buf = std::move(*BufOrErr);
if (Buf)
for (line_iterator I(*Buf, false), E; I != E; ++I)
Content.push_back(*I);
FileContent[FileName] = Content;
return FileName;
}
void BTFDebug::constructLineInfo(const DISubprogram *SP, MCSymbol *Label,
uint32_t Line, uint32_t Column) {
std::string FileName = populateFileContent(SP);
BTFLineInfo LineInfo;
LineInfo.Label = Label;
LineInfo.FileNameOff = addString(FileName);
// If file content is not available, let LineOff = 0.
if (Line < FileContent[FileName].size())
LineInfo.LineOff = addString(FileContent[FileName][Line]);
else
LineInfo.LineOff = 0;
LineInfo.LineNum = Line;
LineInfo.ColumnNum = Column;
LineInfoTable[SecNameOff].push_back(LineInfo);
}
void BTFDebug::emitCommonHeader() {
OS.AddComment("0x" + Twine::utohexstr(BTF::MAGIC));
OS.EmitIntValue(BTF::MAGIC, 2);
OS.EmitIntValue(BTF::VERSION, 1);
OS.EmitIntValue(0, 1);
}
void BTFDebug::emitBTFSection() {
// Do not emit the section if there are no types and only the "" string.
if (!TypeEntries.size() && StringTable.getSize() == 1)
return;
MCContext &Ctx = OS.getContext();
OS.SwitchSection(Ctx.getELFSection(".BTF", ELF::SHT_PROGBITS, 0));
// Emit header.
emitCommonHeader();
OS.EmitIntValue(BTF::HeaderSize, 4);
uint32_t TypeLen = 0, StrLen;
for (const auto &TypeEntry : TypeEntries)
TypeLen += TypeEntry->getSize();
StrLen = StringTable.getSize();
OS.EmitIntValue(0, 4);
OS.EmitIntValue(TypeLen, 4);
OS.EmitIntValue(TypeLen, 4);
OS.EmitIntValue(StrLen, 4);
// Emit type table.
for (const auto &TypeEntry : TypeEntries)
TypeEntry->emitType(OS);
// Emit string table.
uint32_t StringOffset = 0;
for (const auto &S : StringTable.getTable()) {
OS.AddComment("string offset=" + std::to_string(StringOffset));
OS.EmitBytes(S);
OS.EmitBytes(StringRef("\0", 1));
StringOffset += S.size() + 1;
}
}
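// Layout sketch (illustrative): the .BTF section is the common header
// followed by the type table and then the string table; type_off is 0
// and str_off equals the total type table length, matching the four
// offset/length words emitted above.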
void BTFDebug::emitBTFExtSection() {
// Do not emit the section if FuncInfoTable, LineInfoTable,
// and FieldRelocTable are all empty.
if (!FuncInfoTable.size() && !LineInfoTable.size() &&
!FieldRelocTable.size())
return;
MCContext &Ctx = OS.getContext();
OS.SwitchSection(Ctx.getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0));
// Emit header.
emitCommonHeader();
OS.EmitIntValue(BTF::ExtHeaderSize, 4);
// Account for FuncInfo/LineInfo record size as well.
uint32_t FuncLen = 4, LineLen = 4;
// Do not account for optional FieldReloc.
uint32_t FieldRelocLen = 0;
for (const auto &FuncSec : FuncInfoTable) {
FuncLen += BTF::SecFuncInfoSize;
FuncLen += FuncSec.second.size() * BTF::BPFFuncInfoSize;
}
for (const auto &LineSec : LineInfoTable) {
LineLen += BTF::SecLineInfoSize;
LineLen += LineSec.second.size() * BTF::BPFLineInfoSize;
}
for (const auto &FieldRelocSec : FieldRelocTable) {
FieldRelocLen += BTF::SecFieldRelocSize;
FieldRelocLen += FieldRelocSec.second.size() * BTF::BPFFieldRelocSize;
}
if (FieldRelocLen)
FieldRelocLen += 4;
OS.EmitIntValue(0, 4);
OS.EmitIntValue(FuncLen, 4);
OS.EmitIntValue(FuncLen, 4);
OS.EmitIntValue(LineLen, 4);
OS.EmitIntValue(FuncLen + LineLen, 4);
OS.EmitIntValue(FieldRelocLen, 4);
// Emit func_info table.
OS.AddComment("FuncInfo");
OS.EmitIntValue(BTF::BPFFuncInfoSize, 4);
for (const auto &FuncSec : FuncInfoTable) {
OS.AddComment("FuncInfo section string offset=" +
std::to_string(FuncSec.first));
OS.EmitIntValue(FuncSec.first, 4);
OS.EmitIntValue(FuncSec.second.size(), 4);
for (const auto &FuncInfo : FuncSec.second) {
Asm->EmitLabelReference(FuncInfo.Label, 4);
OS.EmitIntValue(FuncInfo.TypeId, 4);
}
}
// Emit line_info table.
OS.AddComment("LineInfo");
OS.EmitIntValue(BTF::BPFLineInfoSize, 4);
for (const auto &LineSec : LineInfoTable) {
OS.AddComment("LineInfo section string offset=" +
std::to_string(LineSec.first));
OS.EmitIntValue(LineSec.first, 4);
OS.EmitIntValue(LineSec.second.size(), 4);
for (const auto &LineInfo : LineSec.second) {
Asm->EmitLabelReference(LineInfo.Label, 4);
OS.EmitIntValue(LineInfo.FileNameOff, 4);
OS.EmitIntValue(LineInfo.LineOff, 4);
OS.AddComment("Line " + std::to_string(LineInfo.LineNum) + " Col " +
std::to_string(LineInfo.ColumnNum));
OS.EmitIntValue(LineInfo.LineNum << 10 | LineInfo.ColumnNum, 4);
}
}
// Emit field reloc table.
if (FieldRelocLen) {
OS.AddComment("FieldReloc");
OS.EmitIntValue(BTF::BPFFieldRelocSize, 4);
for (const auto &FieldRelocSec : FieldRelocTable) {
OS.AddComment("Field reloc section string offset=" +
std::to_string(FieldRelocSec.first));
OS.EmitIntValue(FieldRelocSec.first, 4);
OS.EmitIntValue(FieldRelocSec.second.size(), 4);
for (const auto &FieldRelocInfo : FieldRelocSec.second) {
Asm->EmitLabelReference(FieldRelocInfo.Label, 4);
OS.EmitIntValue(FieldRelocInfo.TypeID, 4);
OS.EmitIntValue(FieldRelocInfo.OffsetNameOff, 4);
OS.EmitIntValue(FieldRelocInfo.RelocKind, 4);
}
}
}
}
void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
auto *SP = MF->getFunction().getSubprogram();
auto *Unit = SP->getUnit();
if (Unit->getEmissionKind() == DICompileUnit::NoDebug) {
SkipInstruction = true;
return;
}
SkipInstruction = false;
// Collect MapDef types. Map definition needs to collect
// pointee types. Do it first. Otherwise, for the following
// case:
// struct m { ...};
// struct t {
// struct m *key;
// };
// foo(struct t *arg);
//
// struct mapdef {
// ...
// struct m *key;
// ...
// } __attribute__((section(".maps"))) hash_map;
//
// If subroutine foo is traversed first, a type chain
// "ptr->struct m(fwd)" will be created and later on
// when traversing mapdef, since "ptr->struct m" exists,
// the traversal of "struct m" will be omitted.
if (MapDefNotCollected) {
processGlobals(true);
MapDefNotCollected = false;
}
// Collect all types locally referenced in this function.
// Use RetainedNodes so we can collect all argument names
// even if the argument is not used.
std::unordered_map<uint32_t, StringRef> FuncArgNames;
for (const DINode *DN : SP->getRetainedNodes()) {
if (const auto *DV = dyn_cast<DILocalVariable>(DN)) {
// Collect function arguments for subprogram func type.
uint32_t Arg = DV->getArg();
if (Arg) {
visitTypeEntry(DV->getType());
FuncArgNames[Arg] = DV->getName();
}
}
}
// Construct subprogram func proto type.
uint32_t ProtoTypeId;
visitSubroutineType(SP->getType(), true, FuncArgNames, ProtoTypeId);
// Construct subprogram func type
uint8_t Scope = SP->isLocalToUnit() ? BTF::FUNC_STATIC : BTF::FUNC_GLOBAL;
auto FuncTypeEntry =
std::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId, Scope);
uint32_t FuncTypeId = addType(std::move(FuncTypeEntry));
for (const auto &TypeEntry : TypeEntries)
TypeEntry->completeType(*this);
// Construct funcinfo and the first lineinfo for the function.
MCSymbol *FuncLabel = Asm->getFunctionBegin();
BTFFuncInfo FuncInfo;
FuncInfo.Label = FuncLabel;
FuncInfo.TypeId = FuncTypeId;
if (FuncLabel->isInSection()) {
MCSection &Section = FuncLabel->getSection();
const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
assert(SectionELF && "Null section for Function Label");
SecNameOff = addString(SectionELF->getSectionName());
} else {
SecNameOff = addString(".text");
}
FuncInfoTable[SecNameOff].push_back(FuncInfo);
}
void BTFDebug::endFunctionImpl(const MachineFunction *MF) {
SkipInstruction = false;
LineInfoGenerated = false;
SecNameOff = 0;
}
/// On-demand populate struct types as requested from abstract member
/// accessing.
unsigned BTFDebug::populateStructType(const DIType *Ty) {
unsigned Id;
visitTypeEntry(Ty, Id, false, false);
for (const auto &TypeEntry : TypeEntries)
TypeEntry->completeType(*this);
return Id;
}
/// Generate a struct member field relocation.
void BTFDebug::generateFieldReloc(const MCSymbol *ORSym, DIType *RootTy,
StringRef AccessPattern) {
unsigned RootId = populateStructType(RootTy);
size_t FirstDollar = AccessPattern.find_first_of('$');
size_t FirstColon = AccessPattern.find_first_of(':');
size_t SecondColon = AccessPattern.find_first_of(':', FirstColon + 1);
StringRef IndexPattern = AccessPattern.substr(FirstDollar + 1);
StringRef RelocKindStr = AccessPattern.substr(FirstColon + 1,
SecondColon - FirstColon);
StringRef PatchImmStr = AccessPattern.substr(SecondColon + 1,
FirstDollar - SecondColon);
BTFFieldReloc FieldReloc;
FieldReloc.Label = ORSym;
FieldReloc.OffsetNameOff = addString(IndexPattern);
FieldReloc.TypeID = RootId;
FieldReloc.RelocKind = std::stoull(RelocKindStr);
PatchImms[AccessPattern.str()] = std::stoul(PatchImmStr);
FieldRelocTable[SecNameOff].push_back(FieldReloc);
}
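// For example (illustrative): an AccessPattern of the form
// "<root type>:<reloc kind>:<patch imm>$<index pattern>", such as
// "t:0:4$0:1", records RelocKind = 0, a patched immediate of 4, and
// "0:1" as the offset name string.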
void BTFDebug::processReloc(const MachineOperand &MO) {
// Check whether this operand is a relocation candidate.
if (MO.isGlobal()) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
MCSymbol *ORSym = OS.getContext().createTempSymbol();
OS.EmitLabel(ORSym);
MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
DIType *Ty = dyn_cast<DIType>(MDN);
generateFieldReloc(ORSym, Ty, GVar->getName());
}
}
}
void BTFDebug::beginInstruction(const MachineInstr *MI) {
DebugHandlerBase::beginInstruction(MI);
if (SkipInstruction || MI->isMetaInstruction() ||
MI->getFlag(MachineInstr::FrameSetup))
return;
if (MI->isInlineAsm()) {
// Count the number of register definitions to find the asm string.
unsigned NumDefs = 0;
for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
++NumDefs)
;
// Skip this inline asm instruction if the asmstr is empty.
const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
if (AsmStr[0] == 0)
return;
}
if (MI->getOpcode() == BPF::LD_imm64) {
// If the insn is "r2 = LD_imm64 @<an AmaAttr global>",
// add this insn into the .BTF.ext FieldReloc subsection.
// Relocation looks like:
// . SecName:
// . InstOffset
// . TypeID
// . OffSetNameOff
// . RelocType
// Later, the insn is replaced with "r2 = <offset>",
// where "<offset>" is the offset computed from the
// current type definitions.
processReloc(MI->getOperand(1));
} else if (MI->getOpcode() == BPF::CORE_MEM ||
MI->getOpcode() == BPF::CORE_ALU32_MEM ||
MI->getOpcode() == BPF::CORE_SHIFT) {
// The relocation insn is a load, store, or shift insn.
processReloc(MI->getOperand(3));
} else if (MI->getOpcode() == BPF::JAL) {
// Check for extern function references.
const MachineOperand &MO = MI->getOperand(0);
if (MO.isGlobal()) {
processFuncPrototypes(dyn_cast<Function>(MO.getGlobal()));
}
}
// Skip this instruction if no DebugLoc or the DebugLoc
// is the same as the previous instruction.
const DebugLoc &DL = MI->getDebugLoc();
if (!DL || PrevInstLoc == DL) {
// This instruction will be skipped. If no LineInfo has
// been generated yet, construct one based on the function signature.
if (!LineInfoGenerated) {
auto *S = MI->getMF()->getFunction().getSubprogram();
MCSymbol *FuncLabel = Asm->getFunctionBegin();
constructLineInfo(S, FuncLabel, S->getLine(), 0);
LineInfoGenerated = true;
}
return;
}
// Create a temporary label to remember the insn for lineinfo.
MCSymbol *LineSym = OS.getContext().createTempSymbol();
OS.EmitLabel(LineSym);
// Construct the lineinfo.
auto SP = DL.get()->getScope()->getSubprogram();
constructLineInfo(SP, LineSym, DL.getLine(), DL.getCol());
LineInfoGenerated = true;
PrevInstLoc = DL;
}
void BTFDebug::processGlobals(bool ProcessingMapDef) {
// Collect all types referenced by globals.
const Module *M = MMI->getModule();
for (const GlobalVariable &Global : M->globals()) {
// Decide the section name.
StringRef SecName;
if (Global.hasSection()) {
SecName = Global.getSection();
} else if (Global.hasInitializer()) {
// data, bss, or readonly sections
if (Global.isConstant())
SecName = ".rodata";
else
SecName = Global.getInitializer()->isZeroValue() ? ".bss" : ".data";
} else {
// Extern variables without an explicit section
// are put into the ".extern" section.
SecName = ".extern";
}
if (ProcessingMapDef != SecName.startswith(".maps"))
continue;
SmallVector<DIGlobalVariableExpression *, 1> GVs;
Global.getDebugInfo(GVs);
// No type information, mostly internal, skip it.
if (GVs.size() == 0)
continue;
uint32_t GVTypeId = 0;
for (auto *GVE : GVs) {
if (SecName.startswith(".maps"))
visitMapDefType(GVE->getVariable()->getType(), GVTypeId);
else
visitTypeEntry(GVE->getVariable()->getType(), GVTypeId, false, false);
break;
}
// Only support the following globals:
// . static variables
// . non-static weak or non-weak global variables
// . weak or non-weak extern global variables
// Whether DataSec is readonly or not can be found from corresponding ELF
// section flags. Whether a BTF_KIND_VAR is a weak symbol or not
// can be found from the corresponding ELF symbol table.
auto Linkage = Global.getLinkage();
if (Linkage != GlobalValue::InternalLinkage &&
Linkage != GlobalValue::ExternalLinkage &&
Linkage != GlobalValue::WeakAnyLinkage &&
Linkage != GlobalValue::ExternalWeakLinkage)
continue;
uint32_t GVarInfo;
if (Linkage == GlobalValue::InternalLinkage) {
GVarInfo = BTF::VAR_STATIC;
} else if (Global.hasInitializer()) {
GVarInfo = BTF::VAR_GLOBAL_ALLOCATED;
} else {
GVarInfo = BTF::VAR_GLOBAL_EXTERNAL;
}
auto VarEntry =
std::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo);
uint32_t VarId = addType(std::move(VarEntry));
assert(!SecName.empty());
// Find or create a DataSec
if (DataSecEntries.find(SecName) == DataSecEntries.end()) {
DataSecEntries[SecName] = std::make_unique<BTFKindDataSec>(Asm, SecName);
}
// Calculate symbol size
const DataLayout &DL = Global.getParent()->getDataLayout();
uint32_t Size = DL.getTypeAllocSize(Global.getType()->getElementType());
DataSecEntries[SecName]->addVar(VarId, Asm->getSymbol(&Global), Size);
}
}
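// For illustration (hypothetical globals): "static const int answer = 42;"
// has an initializer and is constant, so it lands in ".rodata" as a
// BTF_KIND_VAR with VAR_STATIC; an uninitialized "extern int ext_var;"
// lands in the ".extern" DataSec as VAR_GLOBAL_EXTERNAL. Each DataSec
// entry records the var's type id, symbol, and allocated size.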
/// Emit proper patchable instructions.
bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) {
if (MI->getOpcode() == BPF::LD_imm64) {
const MachineOperand &MO = MI->getOperand(1);
if (MO.isGlobal()) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
// Emit "mov ri, <imm>" for patched immediate.
uint32_t Imm = PatchImms[GVar->getName().str()];
OutMI.setOpcode(BPF::MOV_ri);
OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
OutMI.addOperand(MCOperand::createImm(Imm));
return true;
}
}
} else if (MI->getOpcode() == BPF::CORE_MEM ||
MI->getOpcode() == BPF::CORE_ALU32_MEM ||
MI->getOpcode() == BPF::CORE_SHIFT) {
const MachineOperand &MO = MI->getOperand(3);
if (MO.isGlobal()) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
uint32_t Imm = PatchImms[GVar->getName().str()];
OutMI.setOpcode(MI->getOperand(1).getImm());
if (MI->getOperand(0).isImm())
OutMI.addOperand(MCOperand::createImm(MI->getOperand(0).getImm()));
else
OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
OutMI.addOperand(MCOperand::createReg(MI->getOperand(2).getReg()));
OutMI.addOperand(MCOperand::createImm(Imm));
return true;
}
}
}
return false;
}
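// For illustration: if PatchImms holds 8 for a hypothetical AmaAttr global,
// "r2 = LD_imm64 @<that global>" is lowered above to "r2 = mov 8", while a
// CORE_MEM/CORE_ALU32_MEM/CORE_SHIFT insn is lowered to the real opcode
// stashed as the immediate in operand 1, with 8 as its final immediate.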
void BTFDebug::processFuncPrototypes(const Function *F) {
if (!F)
return;
const DISubprogram *SP = F->getSubprogram();
if (!SP || SP->isDefinition())
return;
// Do not emit again if already emitted.
if (ProtoFunctions.find(F) != ProtoFunctions.end())
return;
ProtoFunctions.insert(F);
uint32_t ProtoTypeId;
const std::unordered_map<uint32_t, StringRef> FuncArgNames;
visitSubroutineType(SP->getType(), false, FuncArgNames, ProtoTypeId);
uint8_t Scope = BTF::FUNC_EXTERN;
auto FuncTypeEntry =
std::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId, Scope);
addType(std::move(FuncTypeEntry));
}
void BTFDebug::endModule() {
// Collect MapDef globals if not collected yet.
if (MapDefNotCollected) {
processGlobals(true);
MapDefNotCollected = false;
}
// Collect global types/variables except MapDef globals.
processGlobals(false);
for (auto &DataSec : DataSecEntries)
addType(std::move(DataSec.second));
// Fixups
for (auto &Fixup : FixupDerivedTypes) {
StringRef TypeName = Fixup.first;
bool IsUnion = Fixup.second.first;
// Search through struct types
uint32_t StructTypeId = 0;
for (const auto &StructType : StructTypes) {
if (StructType->getName() == TypeName) {
StructTypeId = StructType->getId();
break;
}
}
if (StructTypeId == 0) {
auto FwdTypeEntry = std::make_unique<BTFTypeFwd>(TypeName, IsUnion);
StructTypeId = addType(std::move(FwdTypeEntry));
}
for (auto &DType : Fixup.second.second) {
DType->setPointeeType(StructTypeId);
}
}
// Complete BTF type cross references.
for (const auto &TypeEntry : TypeEntries)
TypeEntry->completeType(*this);
// Emit BTF sections.
emitBTFSection();
emitBTFExtSection();
}
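// For illustration: if a hypothetical "struct task" is only ever referenced
// through pointers and thus never gets a full struct entry, the fixup loop
// above creates a BTF_KIND_FWD for it and points the pending derived
// (pointer) types at that forward declaration.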
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFGraph.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFGraph.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFGraph.cpp (nonexistent)
@@ -1,1835 +0,0 @@
-//===- RDFGraph.cpp -------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Target-independent, SSA-based data flow graph for register data flow (RDF).
-//
-#include "RDFGraph.h"
-#include "RDFRegisters.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineDominanceFrontier.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/MC/LaneBitmask.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <iterator>
-#include <set>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace rdf;
-
-// Printing functions. Have them here first, so that the rest of the code
-// can use them.
-namespace llvm {
-namespace rdf {
-
-raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P) {
- if (!P.Mask.all())
- OS << ':' << PrintLaneMask(P.Mask);
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) {
- auto &TRI = P.G.getTRI();
- if (P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs())
- OS << TRI.getName(P.Obj.Reg);
- else
- OS << '#' << P.Obj.Reg;
- OS << PrintLaneMaskOpt(P.Obj.Mask);
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) {
- auto NA = P.G.addr<NodeBase*>(P.Obj);
- uint16_t Attrs = NA.Addr->getAttrs();
- uint16_t Kind = NodeAttrs::kind(Attrs);
- uint16_t Flags = NodeAttrs::flags(Attrs);
- switch (NodeAttrs::type(Attrs)) {
- case NodeAttrs::Code:
- switch (Kind) {
- case NodeAttrs::Func: OS << 'f'; break;
- case NodeAttrs::Block: OS << 'b'; break;
- case NodeAttrs::Stmt: OS << 's'; break;
- case NodeAttrs::Phi: OS << 'p'; break;
- default: OS << "c?"; break;
- }
- break;
- case NodeAttrs::Ref:
- if (Flags & NodeAttrs::Undef)
- OS << '/';
- if (Flags & NodeAttrs::Dead)
- OS << '\\';
- if (Flags & NodeAttrs::Preserving)
- OS << '+';
- if (Flags & NodeAttrs::Clobbering)
- OS << '~';
- switch (Kind) {
- case NodeAttrs::Use: OS << 'u'; break;
- case NodeAttrs::Def: OS << 'd'; break;
- case NodeAttrs::Block: OS << 'b'; break;
- default: OS << "r?"; break;
- }
- break;
- default:
- OS << '?';
- break;
- }
- OS << P.Obj;
- if (Flags & NodeAttrs::Shadow)
- OS << '"';
- return OS;
-}
-
-static void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA,
- const DataFlowGraph &G) {
- OS << Print<NodeId>(RA.Id, G) << '<'
- << Print<RegisterRef>(RA.Addr->getRegRef(G), G) << '>';
- if (RA.Addr->getFlags() & NodeAttrs::Fixed)
- OS << '!';
-}
-
-raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) {
- printRefHeader(OS, P.Obj, P.G);
- OS << '(';
- if (NodeId N = P.Obj.Addr->getReachingDef())
- OS << Print<NodeId>(N, P.G);
- OS << ',';
- if (NodeId N = P.Obj.Addr->getReachedDef())
- OS << Print<NodeId>(N, P.G);
- OS << ',';
- if (NodeId N = P.Obj.Addr->getReachedUse())
- OS << Print<NodeId>(N, P.G);
- OS << "):";
- if (NodeId N = P.Obj.Addr->getSibling())
- OS << Print<NodeId>(N, P.G);
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) {
- printRefHeader(OS, P.Obj, P.G);
- OS << '(';
- if (NodeId N = P.Obj.Addr->getReachingDef())
- OS << Print<NodeId>(N, P.G);
- OS << "):";
- if (NodeId N = P.Obj.Addr->getSibling())
- OS << Print<NodeId>(N, P.G);
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS,
- const Print<NodeAddr<PhiUseNode*>> &P) {
- printRefHeader(OS, P.Obj, P.G);
- OS << '(';
- if (NodeId N = P.Obj.Addr->getReachingDef())
- OS << Print<NodeId>(N, P.G);
- OS << ',';
- if (NodeId N = P.Obj.Addr->getPredecessor())
- OS << Print<NodeId>(N, P.G);
- OS << "):";
- if (NodeId N = P.Obj.Addr->getSibling())
- OS << Print<NodeId>(N, P.G);
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) {
- switch (P.Obj.Addr->getKind()) {
- case NodeAttrs::Def:
- OS << PrintNode<DefNode*>(P.Obj, P.G);
- break;
- case NodeAttrs::Use:
- if (P.Obj.Addr->getFlags() & NodeAttrs::PhiRef)
- OS << PrintNode<PhiUseNode*>(P.Obj, P.G);
- else
- OS << PrintNode<UseNode*>(P.Obj, P.G);
- break;
- }
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) {
- unsigned N = P.Obj.size();
- for (auto I : P.Obj) {
- OS << Print<NodeId>(I.Id, P.G);
- if (--N)
- OS << ' ';
- }
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS, const Print<NodeSet> &P) {
- unsigned N = P.Obj.size();
- for (auto I : P.Obj) {
- OS << Print<NodeId>(I, P.G);
- if (--N)
- OS << ' ';
- }
- return OS;
-}
-
-namespace {
-
- template <typename T>
- struct PrintListV {
- PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {}
-
- using Type = T;
- const NodeList &List;
- const DataFlowGraph &G;
- };
-
- template <typename T>
- raw_ostream &operator<< (raw_ostream &OS, const PrintListV<T> &P) {
- unsigned N = P.List.size();
- for (NodeAddr<T> A : P.List) {
- OS << PrintNode<T>(A, P.G);
- if (--N)
- OS << ", ";
- }
- return OS;
- }
-
-} // end anonymous namespace
-
-raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) {
- OS << Print<NodeId>(P.Obj.Id, P.G) << ": phi ["
- << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']';
- return OS;
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<StmtNode *>> &P) {
- const MachineInstr &MI = *P.Obj.Addr->getCode();
- unsigned Opc = MI.getOpcode();
- OS << Print<NodeId>(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc);
- // Print the target for calls and branches (for readability).
- if (MI.isCall() || MI.isBranch()) {
- MachineInstr::const_mop_iterator T =
- llvm::find_if(MI.operands(),
- [] (const MachineOperand &Op) -> bool {
- return Op.isMBB() || Op.isGlobal() || Op.isSymbol();
- });
- if (T != MI.operands_end()) {
- OS << ' ';
- if (T->isMBB())
- OS << printMBBReference(*T->getMBB());
- else if (T->isGlobal())
- OS << T->getGlobal()->getName();
- else if (T->isSymbol())
- OS << T->getSymbolName();
- }
- }
- OS << " [" << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']';
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS,
- const Print<NodeAddr<InstrNode*>> &P) {
- switch (P.Obj.Addr->getKind()) {
- case NodeAttrs::Phi:
- OS << PrintNode<PhiNode*>(P.Obj, P.G);
- break;
- case NodeAttrs::Stmt:
- OS << PrintNode<StmtNode*>(P.Obj, P.G);
- break;
- default:
- OS << "instr? " << Print<NodeId>(P.Obj.Id, P.G);
- break;
- }
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS,
- const Print<NodeAddr<BlockNode*>> &P) {
- MachineBasicBlock *BB = P.Obj.Addr->getCode();
- unsigned NP = BB->pred_size();
- std::vector<int> Ns;
- auto PrintBBs = [&OS] (std::vector<int> Ns) -> void {
- unsigned N = Ns.size();
- for (int I : Ns) {
- OS << "%bb." << I;
- if (--N)
- OS << ", ";
- }
- };
-
- OS << Print<NodeId>(P.Obj.Id, P.G) << ": --- " << printMBBReference(*BB)
- << " --- preds(" << NP << "): ";
- for (MachineBasicBlock *B : BB->predecessors())
- Ns.push_back(B->getNumber());
- PrintBBs(Ns);
-
- unsigned NS = BB->succ_size();
- OS << " succs(" << NS << "): ";
- Ns.clear();
- for (MachineBasicBlock *B : BB->successors())
- Ns.push_back(B->getNumber());
- PrintBBs(Ns);
- OS << '\n';
-
- for (auto I : P.Obj.Addr->members(P.G))
- OS << PrintNode<InstrNode*>(I, P.G) << '\n';
- return OS;
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<FuncNode *>> &P) {
- OS << "DFG dump:[\n" << Print<NodeId>(P.Obj.Id, P.G) << ": Function: "
- << P.Obj.Addr->getCode()->getName() << '\n';
- for (auto I : P.Obj.Addr->members(P.G))
- OS << PrintNode<BlockNode*>(I, P.G) << '\n';
- OS << "]\n";
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) {
- OS << '{';
- for (auto I : P.Obj)
- OS << ' ' << Print<RegisterRef>(I, P.G);
- OS << " }";
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterAggr> &P) {
- P.Obj.print(OS);
- return OS;
-}
-
-raw_ostream &operator<< (raw_ostream &OS,
- const Print<DataFlowGraph::DefStack> &P) {
- for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E; ) {
- OS << Print<NodeId>(I->Id, P.G)
- << '<' << Print<RegisterRef>(I->Addr->getRegRef(P.G), P.G) << '>';
- I.down();
- if (I != E)
- OS << ' ';
- }
- return OS;
-}
-
-} // end namespace rdf
-} // end namespace llvm
-
-// Node allocation functions.
-//
-// Node allocator is like a slab memory allocator: it allocates blocks of
-// memory in sizes that are multiples of the size of a node. Each block has
-// the same size. Nodes are allocated from the currently active block, and
-// when it becomes full, a new one is created.
-// The mapping scheme between a node id and its location (the containing
-// block, and the position within that block) is described in the header
-// file.
-//
-void NodeAllocator::startNewBlock() {
- void *T = MemPool.Allocate(NodesPerBlock*NodeMemSize, NodeMemSize);
- char *P = static_cast<char*>(T);
- Blocks.push_back(P);
- // Check if the block index is still within the allowed range, i.e. less
- // than 2^N, where N is the number of bits in NodeId for the block index.
- // BitsPerIndex is the number of bits per node index.
- assert((Blocks.size() < ((size_t)1 << (8*sizeof(NodeId)-BitsPerIndex))) &&
- "Out of bits for block index");
- ActiveEnd = P;
-}
-
-bool NodeAllocator::needNewBlock() {
- if (Blocks.empty())
- return true;
-
- char *ActiveBegin = Blocks.back();
- uint32_t Index = (ActiveEnd-ActiveBegin)/NodeMemSize;
- return Index >= NodesPerBlock;
-}
-
-NodeAddr<NodeBase*> NodeAllocator::New() {
- if (needNewBlock())
- startNewBlock();
-
- uint32_t ActiveB = Blocks.size()-1;
- uint32_t Index = (ActiveEnd - Blocks[ActiveB])/NodeMemSize;
- NodeAddr<NodeBase*> NA = { reinterpret_cast<NodeBase*>(ActiveEnd),
- makeId(ActiveB, Index) };
- ActiveEnd += NodeMemSize;
- return NA;
-}
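-// For illustration: a NodeId packs the block number and the index within
-// the block, roughly ((Block << BitsPerIndex) | Index) biased so that 0
-// can serve as a null id; this is a sketch, the authoritative encoding is
-// defined in RDFGraph.h.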
-
-NodeId NodeAllocator::id(const NodeBase *P) const {
- uintptr_t A = reinterpret_cast<uintptr_t>(P);
- for (unsigned i = 0, n = Blocks.size(); i != n; ++i) {
- uintptr_t B = reinterpret_cast<uintptr_t>(Blocks[i]);
- if (A < B || A >= B + NodesPerBlock*NodeMemSize)
- continue;
- uint32_t Idx = (A-B)/NodeMemSize;
- return makeId(i, Idx);
- }
- llvm_unreachable("Invalid node address");
-}
-
-void NodeAllocator::clear() {
- MemPool.Reset();
- Blocks.clear();
- ActiveEnd = nullptr;
-}
-
-// Insert node NA after "this" in the circular chain.
-void NodeBase::append(NodeAddr<NodeBase*> NA) {
- NodeId Nx = Next;
- // If NA is already "next", do nothing.
- if (Next != NA.Id) {
- Next = NA.Id;
- NA.Addr->Next = Nx;
- }
-}
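-// For illustration: given a circular chain A -> C -> ... -> A, calling
-// A.append(B) relinks it to A -> B -> C -> ... -> A; appending a node
-// that is already "next" leaves the chain unchanged.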
-
-// Fundamental node manipulator functions.
-
-// Obtain the register reference from a reference node.
-RegisterRef RefNode::getRegRef(const DataFlowGraph &G) const {
- assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
- if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)
- return G.unpack(Ref.PR);
- assert(Ref.Op != nullptr);
- return G.makeRegRef(*Ref.Op);
-}
-
-// Set the register reference in the reference node directly (for references
-// in phi nodes).
-void RefNode::setRegRef(RegisterRef RR, DataFlowGraph &G) {
- assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
- assert(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef);
- Ref.PR = G.pack(RR);
-}
-
-// Set the register reference in the reference node based on a machine
-// operand (for references in statement nodes).
-void RefNode::setRegRef(MachineOperand *Op, DataFlowGraph &G) {
- assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
- assert(!(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef));
- (void)G;
- Ref.Op = Op;
-}
-
-// Get the owner of a given reference node.
-NodeAddr<NodeBase*> RefNode::getOwner(const DataFlowGraph &G) {
- NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext());
-
- while (NA.Addr != this) {
- if (NA.Addr->getType() == NodeAttrs::Code)
- return NA;
- NA = G.addr<NodeBase*>(NA.Addr->getNext());
- }
- llvm_unreachable("No owner in circular list");
-}
-
-// Connect the def node to the reaching def node.
-void DefNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) {
- Ref.RD = DA.Id;
- Ref.Sib = DA.Addr->getReachedDef();
- DA.Addr->setReachedDef(Self);
-}
-
-// Connect the use node to the reaching def node.
-void UseNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) {
- Ref.RD = DA.Id;
- Ref.Sib = DA.Addr->getReachedUse();
- DA.Addr->setReachedUse(Self);
-}
-
-// Get the first member of the code node.
-NodeAddr<NodeBase*> CodeNode::getFirstMember(const DataFlowGraph &G) const {
- if (Code.FirstM == 0)
- return NodeAddr<NodeBase*>();
- return G.addr<NodeBase*>(Code.FirstM);
-}
-
-// Get the last member of the code node.
-NodeAddr<NodeBase*> CodeNode::getLastMember(const DataFlowGraph &G) const {
- if (Code.LastM == 0)
- return NodeAddr<NodeBase*>();
- return G.addr<NodeBase*>(Code.LastM);
-}
-
-// Add node NA at the end of the member list of the given code node.
-void CodeNode::addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) {
- NodeAddr<NodeBase*> ML = getLastMember(G);
- if (ML.Id != 0) {
- ML.Addr->append(NA);
- } else {
- Code.FirstM = NA.Id;
- NodeId Self = G.id(this);
- NA.Addr->setNext(Self);
- }
- Code.LastM = NA.Id;
-}
-
-// Add node NA after member node MA in the given code node.
-void CodeNode::addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA,
- const DataFlowGraph &G) {
- MA.Addr->append(NA);
- if (Code.LastM == MA.Id)
- Code.LastM = NA.Id;
-}
-
-// Remove member node NA from the given code node.
-void CodeNode::removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) {
- NodeAddr<NodeBase*> MA = getFirstMember(G);
- assert(MA.Id != 0);
-
- // Special handling if the member to remove is the first member.
- if (MA.Id == NA.Id) {
- if (Code.LastM == MA.Id) {
- // If it is the only member, set both first and last to 0.
- Code.FirstM = Code.LastM = 0;
- } else {
- // Otherwise, advance the first member.
- Code.FirstM = MA.Addr->getNext();
- }
- return;
- }
-
- while (MA.Addr != this) {
- NodeId MX = MA.Addr->getNext();
- if (MX == NA.Id) {
- MA.Addr->setNext(NA.Addr->getNext());
- // If the member to remove happens to be the last one, update the
- // LastM indicator.
- if (Code.LastM == NA.Id)
- Code.LastM = MA.Id;
- return;
- }
- MA = G.addr<NodeBase*>(MX);
- }
- llvm_unreachable("No such member");
-}
-
-// Return the list of all members of the code node.
-NodeList CodeNode::members(const DataFlowGraph &G) const {
- static auto True = [] (NodeAddr<NodeBase*>) -> bool { return true; };
- return members_if(True, G);
-}
-
-// Return the owner of the given instr node.
-NodeAddr<NodeBase*> InstrNode::getOwner(const DataFlowGraph &G) {
- NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext());
-
- while (NA.Addr != this) {
- assert(NA.Addr->getType() == NodeAttrs::Code);
- if (NA.Addr->getKind() == NodeAttrs::Block)
- return NA;
- NA = G.addr<NodeBase*>(NA.Addr->getNext());
- }
- llvm_unreachable("No owner in circular list");
-}
-
-// Add the phi node PA to the given block node.
-void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) {
- NodeAddr<NodeBase*> M = getFirstMember(G);
- if (M.Id == 0) {
- addMember(PA, G);
- return;
- }
-
- assert(M.Addr->getType() == NodeAttrs::Code);
- if (M.Addr->getKind() == NodeAttrs::Stmt) {
- // If the first member of the block is a statement, insert the phi as
- // the first member.
- Code.FirstM = PA.Id;
- PA.Addr->setNext(M.Id);
- } else {
- // If the first member is a phi, find the last phi, and append PA to it.
- assert(M.Addr->getKind() == NodeAttrs::Phi);
- NodeAddr<NodeBase*> MN = M;
- do {
- M = MN;
- MN = G.addr<NodeBase*>(M.Addr->getNext());
- assert(MN.Addr->getType() == NodeAttrs::Code);
- } while (MN.Addr->getKind() == NodeAttrs::Phi);
-
- // M is the last phi.
- addMemberAfter(M, PA, G);
- }
-}
-
-// Find the block node corresponding to the machine basic block BB in the
-// given func node.
-NodeAddr<BlockNode*> FuncNode::findBlock(const MachineBasicBlock *BB,
- const DataFlowGraph &G) const {
- auto EqBB = [BB] (NodeAddr<NodeBase*> NA) -> bool {
- return NodeAddr<BlockNode*>(NA).Addr->getCode() == BB;
- };
- NodeList Ms = members_if(EqBB, G);
- if (!Ms.empty())
- return Ms[0];
- return NodeAddr<BlockNode*>();
-}
-
-// Get the block node for the entry block in the given function.
-NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) {
- MachineBasicBlock *EntryB = &getCode()->front();
- return findBlock(EntryB, G);
-}
-
-// Target operand information.
-//
-
-// For a given instruction, check if there are any bits of RR that can remain
-// unchanged across this def.
-bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum)
- const {
- return TII.isPredicated(In);
-}
-
-// Check if the definition of RR produces an unspecified value.
-bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum)
- const {
- const MachineOperand &Op = In.getOperand(OpNum);
- if (Op.isRegMask())
- return true;
- assert(Op.isReg());
- if (In.isCall())
- if (Op.isDef() && Op.isDead())
- return true;
- return false;
-}
-
-// Check if the given instruction specifically requires the register in
-// operand OpNum to be a fixed register (e.g. an implicit use or def
-// listed in the instruction descriptor).
-bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum)
- const {
- if (In.isCall() || In.isReturn() || In.isInlineAsm())
- return true;
- // Check for a tail call.
- if (In.isBranch())
- for (const MachineOperand &O : In.operands())
- if (O.isGlobal() || O.isSymbol())
- return true;
-
- const MCInstrDesc &D = In.getDesc();
- if (!D.getImplicitDefs() && !D.getImplicitUses())
- return false;
- const MachineOperand &Op = In.getOperand(OpNum);
- // If there is a sub-register, treat the operand as non-fixed. Currently,
- // fixed registers are those that are listed in the descriptor as implicit
- // uses or defs, and those lists do not allow sub-registers.
- if (Op.getSubReg() != 0)
- return false;
- Register Reg = Op.getReg();
- const MCPhysReg *ImpR = Op.isDef() ? D.getImplicitDefs()
- : D.getImplicitUses();
- if (!ImpR)
- return false;
- while (*ImpR)
- if (*ImpR++ == Reg)
- return true;
- return false;
-}
-
-//
-// The data flow graph construction.
-//
-
-DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
- const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
- const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi)
- : MF(mf), TII(tii), TRI(tri), PRI(tri, mf), MDT(mdt), MDF(mdf), TOI(toi),
- LiveIns(PRI) {
-}
-
-// The implementation of the definition stack.
-// Each register reference has its own definition stack. In particular,
-// register references "Reg" and "Reg:subreg" will each have their own
-// definition stacks.
-
-// Construct a stack iterator.
-DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S,
- bool Top) : DS(S) {
- if (!Top) {
- // Initialize to bottom.
- Pos = 0;
- return;
- }
- // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty).
- Pos = DS.Stack.size();
- while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos-1]))
- Pos--;
-}
-
-// Return the size of the stack, not counting block delimiters.
-unsigned DataFlowGraph::DefStack::size() const {
- unsigned S = 0;
- for (auto I = top(), E = bottom(); I != E; I.down())
- S++;
- return S;
-}
-
-// Remove the top entry from the stack. Remove all intervening delimiters
-// so that after this, the stack is either empty, or the top of the stack
-// is a non-delimiter.
-void DataFlowGraph::DefStack::pop() {
- assert(!empty());
- unsigned P = nextDown(Stack.size());
- Stack.resize(P);
-}
-
-// Push a delimiter for block node N on the stack.
-void DataFlowGraph::DefStack::start_block(NodeId N) {
- assert(N != 0);
- Stack.push_back(NodeAddr<DefNode*>(nullptr, N));
-}
-
-// Remove all nodes from the top of the stack, until the delimiter for
-// block node N is encountered. Remove the delimiter as well. In effect,
-// this will remove from the stack all definitions from block N.
-void DataFlowGraph::DefStack::clear_block(NodeId N) {
- assert(N != 0);
- unsigned P = Stack.size();
- while (P > 0) {
- bool Found = isDelimiter(Stack[P-1], N);
- P--;
- if (Found)
- break;
- }
- // This will also remove the delimiter, if found.
- Stack.resize(P);
-}
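-// For illustration: if the stack is, bottom to top,
-//   [ d1, <delim B1>, d2, <delim B2>, d3 ]
-// then clear_block(B2) resizes it to [ d1, <delim B1>, d2 ], removing d3
-// together with the B2 delimiter.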
-
-// Move the stack iterator up by one.
-unsigned DataFlowGraph::DefStack::nextUp(unsigned P) const {
- // Get the next valid position after P (skipping all delimiters).
- // The input position P does not have to point to a non-delimiter.
- unsigned SS = Stack.size();
- bool IsDelim;
- assert(P < SS);
- do {
- P++;
- IsDelim = isDelimiter(Stack[P-1]);
- } while (P < SS && IsDelim);
- assert(!IsDelim);
- return P;
-}
-
-// Move the stack iterator down by one.
-unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
- // Get the preceding valid position before P (skipping all delimiters).
- // The input position P does not have to point to a non-delimiter.
- assert(P > 0 && P <= Stack.size());
- bool IsDelim = isDelimiter(Stack[P-1]);
- do {
- if (--P == 0)
- break;
- IsDelim = isDelimiter(Stack[P-1]);
- } while (P > 0 && IsDelim);
- assert(!IsDelim);
- return P;
-}
-
-// Register information.
-
-RegisterSet DataFlowGraph::getLandingPadLiveIns() const {
- RegisterSet LR;
- const Function &F = MF.getFunction();
- const Constant *PF = F.hasPersonalityFn() ? F.getPersonalityFn()
- : nullptr;
- const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
- if (RegisterId R = TLI.getExceptionPointerRegister(PF))
- LR.insert(RegisterRef(R));
- if (RegisterId R = TLI.getExceptionSelectorRegister(PF))
- LR.insert(RegisterRef(R));
- return LR;
-}
-
-// Node management functions.
-
-// Get the pointer to the node with the id N.
-NodeBase *DataFlowGraph::ptr(NodeId N) const {
- if (N == 0)
- return nullptr;
- return Memory.ptr(N);
-}
-
-// Get the id of the node at the address P.
-NodeId DataFlowGraph::id(const NodeBase *P) const {
- if (P == nullptr)
- return 0;
- return Memory.id(P);
-}
-
-// Allocate a new node and set the attributes to Attrs.
-NodeAddr<NodeBase*> DataFlowGraph::newNode(uint16_t Attrs) {
- NodeAddr<NodeBase*> P = Memory.New();
- P.Addr->init();
- P.Addr->setAttrs(Attrs);
- return P;
-}
-
-// Make a copy of the given node B, except for the data-flow links, which
-// are set to 0.
-NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) {
- NodeAddr<NodeBase*> NA = newNode(0);
- memcpy(NA.Addr, B.Addr, sizeof(NodeBase));
- // Ref nodes need to have the data-flow links reset.
- if (NA.Addr->getType() == NodeAttrs::Ref) {
- NodeAddr<RefNode*> RA = NA;
- RA.Addr->setReachingDef(0);
- RA.Addr->setSibling(0);
- if (NA.Addr->getKind() == NodeAttrs::Def) {
- NodeAddr<DefNode*> DA = NA;
- DA.Addr->setReachedDef(0);
- DA.Addr->setReachedUse(0);
- }
- }
- return NA;
-}
-
-// Allocation routines for specific node types/kinds.
-
-NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner,
- MachineOperand &Op, uint16_t Flags) {
- NodeAddr<UseNode*> UA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags);
- UA.Addr->setRegRef(&Op, *this);
- return UA;
-}
-
-NodeAddr<PhiUseNode*> DataFlowGraph::newPhiUse(NodeAddr<PhiNode*> Owner,
- RegisterRef RR, NodeAddr<BlockNode*> PredB, uint16_t Flags) {
- NodeAddr<PhiUseNode*> PUA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags);
- assert(Flags & NodeAttrs::PhiRef);
- PUA.Addr->setRegRef(RR, *this);
- PUA.Addr->setPredecessor(PredB.Id);
- return PUA;
-}
-
-NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner,
- MachineOperand &Op, uint16_t Flags) {
- NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags);
- DA.Addr->setRegRef(&Op, *this);
- return DA;
-}
-
-NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner,
- RegisterRef RR, uint16_t Flags) {
- NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags);
- assert(Flags & NodeAttrs::PhiRef);
- DA.Addr->setRegRef(RR, *this);
- return DA;
-}
-
-NodeAddr<PhiNode*> DataFlowGraph::newPhi(NodeAddr<BlockNode*> Owner) {
- NodeAddr<PhiNode*> PA = newNode(NodeAttrs::Code | NodeAttrs::Phi);
- Owner.Addr->addPhi(PA, *this);
- return PA;
-}
-
-NodeAddr<StmtNode*> DataFlowGraph::newStmt(NodeAddr<BlockNode*> Owner,
- MachineInstr *MI) {
- NodeAddr<StmtNode*> SA = newNode(NodeAttrs::Code | NodeAttrs::Stmt);
- SA.Addr->setCode(MI);
- Owner.Addr->addMember(SA, *this);
- return SA;
-}
-
-NodeAddr<BlockNode*> DataFlowGraph::newBlock(NodeAddr<FuncNode*> Owner,
- MachineBasicBlock *BB) {
- NodeAddr<BlockNode*> BA = newNode(NodeAttrs::Code | NodeAttrs::Block);
- BA.Addr->setCode(BB);
- Owner.Addr->addMember(BA, *this);
- return BA;
-}
-
-NodeAddr<FuncNode*> DataFlowGraph::newFunc(MachineFunction *MF) {
- NodeAddr<FuncNode*> FA = newNode(NodeAttrs::Code | NodeAttrs::Func);
- FA.Addr->setCode(MF);
- return FA;
-}
-
-// Build the data flow graph.
-void DataFlowGraph::build(unsigned Options) {
- reset();
- Func = newFunc(&MF);
-
- if (MF.empty())
- return;
-
- for (MachineBasicBlock &B : MF) {
- NodeAddr<BlockNode*> BA = newBlock(Func, &B);
- BlockNodes.insert(std::make_pair(&B, BA));
- for (MachineInstr &I : B) {
- if (I.isDebugInstr())
- continue;
- buildStmt(BA, I);
- }
- }
-
- NodeAddr<BlockNode*> EA = Func.Addr->getEntryBlock(*this);
- NodeList Blocks = Func.Addr->members(*this);
-
- // Collect information about block references.
- RegisterSet AllRefs;
- for (NodeAddr<BlockNode*> BA : Blocks)
- for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this))
- for (NodeAddr<RefNode*> RA : IA.Addr->members(*this))
- AllRefs.insert(RA.Addr->getRegRef(*this));
-
- // Collect function live-ins and entry block live-ins.
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MachineBasicBlock &EntryB = *EA.Addr->getCode();
- assert(EntryB.pred_empty() && "Function entry block has predecessors");
- for (std::pair<unsigned,unsigned> P : MRI.liveins())
- LiveIns.insert(RegisterRef(P.first));
- if (MRI.tracksLiveness()) {
- for (auto I : EntryB.liveins())
- LiveIns.insert(RegisterRef(I.PhysReg, I.LaneMask));
- }
-
- // Add function-entry phi nodes for the live-in registers.
- //for (std::pair<RegisterId,LaneBitmask> P : LiveIns) {
- for (auto I = LiveIns.rr_begin(), E = LiveIns.rr_end(); I != E; ++I) {
- RegisterRef RR = *I;
- NodeAddr<PhiNode*> PA = newPhi(EA);
- uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
- NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
- PA.Addr->addMember(DA, *this);
- }
-
- // Add phis for landing pads.
- // Landing pads, unlike usual basic blocks, are not entered through
- // branches in the program, or fall-throughs from other blocks. They
- // are entered from the exception handling runtime, and the target's ABI
- // may define certain registers as defined on entry to such a block.
- RegisterSet EHRegs = getLandingPadLiveIns();
- if (!EHRegs.empty()) {
- for (NodeAddr<BlockNode*> BA : Blocks) {
- const MachineBasicBlock &B = *BA.Addr->getCode();
- if (!B.isEHPad())
- continue;
-
- // Prepare a list of NodeIds of the block's predecessors.
- NodeList Preds;
- for (MachineBasicBlock *PB : B.predecessors())
- Preds.push_back(findBlock(PB));
-
- // Build phi nodes for each live-in.
- for (RegisterRef RR : EHRegs) {
- NodeAddr<PhiNode*> PA = newPhi(BA);
- uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
- // Add def:
- NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
- PA.Addr->addMember(DA, *this);
- // Add uses (no reaching defs for phi uses):
- for (NodeAddr<BlockNode*> PBA : Preds) {
- NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA);
- PA.Addr->addMember(PUA, *this);
- }
- }
- }
- }
-
- // Build a map "PhiM" which will contain, for each block, the set
- // of references that will require phi definitions in that block.
- BlockRefsMap PhiM;
- for (NodeAddr<BlockNode*> BA : Blocks)
- recordDefsForDF(PhiM, BA);
- for (NodeAddr<BlockNode*> BA : Blocks)
- buildPhis(PhiM, AllRefs, BA);
-
- // Link all the refs. This will recursively traverse the dominator tree.
- DefStackMap DM;
- linkBlockRefs(DM, EA);
-
- // Finally, remove all unused phi nodes.
- if (!(Options & BuildOptions::KeepDeadPhis))
- removeUnusedPhis();
-}
-
-RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const {
- assert(PhysicalRegisterInfo::isRegMaskId(Reg) ||
- Register::isPhysicalRegister(Reg));
- assert(Reg != 0);
- if (Sub != 0)
- Reg = TRI.getSubReg(Reg, Sub);
- return RegisterRef(Reg);
-}
-
-RegisterRef DataFlowGraph::makeRegRef(const MachineOperand &Op) const {
- assert(Op.isReg() || Op.isRegMask());
- if (Op.isReg())
- return makeRegRef(Op.getReg(), Op.getSubReg());
- return RegisterRef(PRI.getRegMaskId(Op.getRegMask()), LaneBitmask::getAll());
-}
-
-RegisterRef DataFlowGraph::restrictRef(RegisterRef AR, RegisterRef BR) const {
- if (AR.Reg == BR.Reg) {
- LaneBitmask M = AR.Mask & BR.Mask;
- return M.any() ? RegisterRef(AR.Reg, M) : RegisterRef();
- }
-#ifndef NDEBUG
-// RegisterRef NAR = PRI.normalize(AR);
-// RegisterRef NBR = PRI.normalize(BR);
-// assert(NAR.Reg != NBR.Reg);
-#endif
- // This isn't strictly correct, because the overlap may happen in the
- // part masked out.
- if (PRI.alias(AR, BR))
- return AR;
- return RegisterRef();
-}
-
-// For each stack in the map DefM, push the delimiter for block B on it.
-void DataFlowGraph::markBlock(NodeId B, DefStackMap &DefM) {
- // Push block delimiters.
- for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I)
- I->second.start_block(B);
-}
-
-// Remove all definitions coming from block B from each stack in DefM.
-void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) {
- // Pop all defs from this block from the definition stack. Defs that were
- // added to the map during the traversal of instructions will not have a
- // delimiter, but for those, the whole stack will be emptied.
- for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I)
- I->second.clear_block(B);
-
- // Finally, remove empty stacks from the map.
- for (auto I = DefM.begin(), E = DefM.end(), NextI = I; I != E; I = NextI) {
- NextI = std::next(I);
- // This preserves the validity of iterators other than I.
- if (I->second.empty())
- DefM.erase(I);
- }
-}
-
-// Push all definitions from the instruction node IA to an appropriate
-// stack in DefM.
-void DataFlowGraph::pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
- pushClobbers(IA, DefM);
- pushDefs(IA, DefM);
-}
-
-// Push all definitions from the instruction node IA to an appropriate
-// stack in DefM.
-void DataFlowGraph::pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
- NodeSet Visited;
- std::set<RegisterId> Defined;
-
- // The important objectives of this function are:
- // - to be able to handle instructions both while the graph is being
- // constructed, and after the graph has been constructed, and
- // - maintain proper ordering of definitions on the stack for each
- // register reference:
- // - if there are two or more related defs in IA (i.e. coming from
- // the same machine operand), then only push one def on the stack,
- // - if there are multiple unrelated defs of non-overlapping
- // subregisters of S, then the stack for S will have both (in an
- // unspecified order), but the order does not matter from the data-
- // -flow perspective.
-
- for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) {
- if (Visited.count(DA.Id))
- continue;
- if (!(DA.Addr->getFlags() & NodeAttrs::Clobbering))
- continue;
-
- NodeList Rel = getRelatedRefs(IA, DA);
- NodeAddr<DefNode*> PDA = Rel.front();
- RegisterRef RR = PDA.Addr->getRegRef(*this);
-
- // Push the definition on the stack for the register and all aliases.
- // The def stack traversal in linkNodeUp will check the exact aliasing.
- DefM[RR.Reg].push(DA);
- Defined.insert(RR.Reg);
- for (RegisterId A : PRI.getAliasSet(RR.Reg)) {
- // Check that we don't push the same def twice.
- assert(A != RR.Reg);
- if (!Defined.count(A))
- DefM[A].push(DA);
- }
- // Mark all the related defs as visited.
- for (NodeAddr<NodeBase*> T : Rel)
- Visited.insert(T.Id);
- }
-}
-
-// Push all definitions from the instruction node IA to an appropriate
-// stack in DefM.
-void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
- NodeSet Visited;
-#ifndef NDEBUG
- std::set<RegisterId> Defined;
-#endif
-
- // The important objectives of this function are:
- // - to be able to handle instructions both while the graph is being
- // constructed, and after the graph has been constructed, and
- // - maintain proper ordering of definitions on the stack for each
- // register reference:
- // - if there are two or more related defs in IA (i.e. coming from
- // the same machine operand), then only push one def on the stack,
- // - if there are multiple unrelated defs of non-overlapping
- // subregisters of S, then the stack for S will have both (in an
- // unspecified order), but the order does not matter from the data-
- // -flow perspective.
-
- for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) {
- if (Visited.count(DA.Id))
- continue;
- if (DA.Addr->getFlags() & NodeAttrs::Clobbering)
- continue;
-
- NodeList Rel = getRelatedRefs(IA, DA);
- NodeAddr<DefNode*> PDA = Rel.front();
- RegisterRef RR = PDA.Addr->getRegRef(*this);
-#ifndef NDEBUG
- // Assert if the register is defined in two or more unrelated defs.
- // This could happen if there are two or more def operands defining it.
- if (!Defined.insert(RR.Reg).second) {
- MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
- dbgs() << "Multiple definitions of register: "
- << Print<RegisterRef>(RR, *this) << " in\n " << *MI << "in "
- << printMBBReference(*MI->getParent()) << '\n';
- llvm_unreachable(nullptr);
- }
-#endif
- // Push the definition on the stack for the register and all aliases.
- // The def stack traversal in linkNodeUp will check the exact aliasing.
- DefM[RR.Reg].push(DA);
- for (RegisterId A : PRI.getAliasSet(RR.Reg)) {
- // Check that we don't push the same def twice.
- assert(A != RR.Reg);
- DefM[A].push(DA);
- }
- // Mark all the related defs as visited.
- for (NodeAddr<NodeBase*> T : Rel)
- Visited.insert(T.Id);
- }
-}
-
-// Return the list of all reference nodes related to RA, including RA itself.
-// See "getNextRelated" for the meaning of a "related reference".
-NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const {
- assert(IA.Id != 0 && RA.Id != 0);
-
- NodeList Refs;
- NodeId Start = RA.Id;
- do {
- Refs.push_back(RA);
- RA = getNextRelated(IA, RA);
- } while (RA.Id != 0 && RA.Id != Start);
- return Refs;
-}
-
-// Clear all information in the graph.
-void DataFlowGraph::reset() {
- Memory.clear();
- BlockNodes.clear();
- Func = NodeAddr<FuncNode*>();
-}
-
-// Return the next reference node in the instruction node IA that is related
-// to RA. Conceptually, two reference nodes are related if they refer to the
-// same instance of a register access, but differ in flags or other minor
-// characteristics. Specific examples of related nodes are shadow reference
-// nodes.
-// Return the equivalent of nullptr if there are no more related references.
-NodeAddr<RefNode*> DataFlowGraph::getNextRelated(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const {
- assert(IA.Id != 0 && RA.Id != 0);
-
- auto Related = [this,RA](NodeAddr<RefNode*> TA) -> bool {
- if (TA.Addr->getKind() != RA.Addr->getKind())
- return false;
- if (TA.Addr->getRegRef(*this) != RA.Addr->getRegRef(*this))
- return false;
- return true;
- };
- auto RelatedStmt = [&Related,RA](NodeAddr<RefNode*> TA) -> bool {
- return Related(TA) &&
- &RA.Addr->getOp() == &TA.Addr->getOp();
- };
- auto RelatedPhi = [&Related,RA](NodeAddr<RefNode*> TA) -> bool {
- if (!Related(TA))
- return false;
- if (TA.Addr->getKind() != NodeAttrs::Use)
- return true;
- // For phi uses, compare predecessor blocks.
- const NodeAddr<const PhiUseNode*> TUA = TA;
- const NodeAddr<const PhiUseNode*> RUA = RA;
- return TUA.Addr->getPredecessor() == RUA.Addr->getPredecessor();
- };
-
- RegisterRef RR = RA.Addr->getRegRef(*this);
- if (IA.Addr->getKind() == NodeAttrs::Stmt)
- return RA.Addr->getNextRef(RR, RelatedStmt, true, *this);
- return RA.Addr->getNextRef(RR, RelatedPhi, true, *this);
-}
-
-// Find the next node related to RA in IA that satisfies condition P.
-// If such a node was found, return a pair where the second element is the
-// located node. If such a node does not exist, return a pair where the
-// first element is the element after which such a node should be inserted,
-// and the second element is a null-address.
-template <typename Predicate>
-std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
-DataFlowGraph::locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
- Predicate P) const {
- assert(IA.Id != 0 && RA.Id != 0);
-
- NodeAddr<RefNode*> NA;
- NodeId Start = RA.Id;
- while (true) {
- NA = getNextRelated(IA, RA);
- if (NA.Id == 0 || NA.Id == Start)
- break;
- if (P(NA))
- break;
- RA = NA;
- }
-
- if (NA.Id != 0 && NA.Id != Start)
- return std::make_pair(RA, NA);
- return std::make_pair(RA, NodeAddr<RefNode*>());
-}
-
-// Get the next shadow node in IA corresponding to RA, and optionally create
-// such a node if it does not exist.
-NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA, bool Create) {
- assert(IA.Id != 0 && RA.Id != 0);
-
- uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
- auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool {
- return TA.Addr->getFlags() == Flags;
- };
- auto Loc = locateNextRef(IA, RA, IsShadow);
- if (Loc.second.Id != 0 || !Create)
- return Loc.second;
-
- // Create a copy of RA and mark it as a shadow.
- NodeAddr<RefNode*> NA = cloneNode(RA);
- NA.Addr->setFlags(Flags | NodeAttrs::Shadow);
- IA.Addr->addMemberAfter(Loc.first, NA, *this);
- return NA;
-}
-
-// Get the next shadow node in IA corresponding to RA. Return null-address
-// if such a node does not exist.
-NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const {
- assert(IA.Id != 0 && RA.Id != 0);
- uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
- auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool {
- return TA.Addr->getFlags() == Flags;
- };
- return locateNextRef(IA, RA, IsShadow).second;
-}
-
-// Create a new statement node in the block node BA that corresponds to
-// the machine instruction MI.
-void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
- NodeAddr<StmtNode*> SA = newStmt(BA, &In);
-
- auto isCall = [] (const MachineInstr &In) -> bool {
- if (In.isCall())
- return true;
- // Is tail call?
- if (In.isBranch()) {
- for (const MachineOperand &Op : In.operands())
- if (Op.isGlobal() || Op.isSymbol())
- return true;
- // Assume indirect branches are calls. This is for the purpose of
- // keeping implicit operands, and so it won't hurt on intra-function
- // indirect branches.
- if (In.isIndirectBranch())
- return true;
- }
- return false;
- };
-
- auto isDefUndef = [this] (const MachineInstr &In, RegisterRef DR) -> bool {
- // This instruction defines DR. Check if there is a use operand that
- // would make DR live on entry to the instruction.
- for (const MachineOperand &Op : In.operands()) {
- if (!Op.isReg() || Op.getReg() == 0 || !Op.isUse() || Op.isUndef())
- continue;
- RegisterRef UR = makeRegRef(Op);
- if (PRI.alias(DR, UR))
- return false;
- }
- return true;
- };
-
- bool IsCall = isCall(In);
- unsigned NumOps = In.getNumOperands();
-
- // Avoid duplicate implicit defs. This will not detect cases of implicit
- // defs that define registers that overlap, but it is not clear how to
- // interpret that in the absence of explicit defs. Overlapping explicit
- // defs are likely illegal already.
- BitVector DoneDefs(TRI.getNumRegs());
- // Process explicit defs first.
- for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
- MachineOperand &Op = In.getOperand(OpN);
- if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
- continue;
- Register R = Op.getReg();
- if (!R || !Register::isPhysicalRegister(R))
- continue;
- uint16_t Flags = NodeAttrs::None;
- if (TOI.isPreserving(In, OpN)) {
- Flags |= NodeAttrs::Preserving;
- // If the def is preserving, check if it is also undefined.
- if (isDefUndef(In, makeRegRef(Op)))
- Flags |= NodeAttrs::Undef;
- }
- if (TOI.isClobbering(In, OpN))
- Flags |= NodeAttrs::Clobbering;
- if (TOI.isFixedReg(In, OpN))
- Flags |= NodeAttrs::Fixed;
- if (IsCall && Op.isDead())
- Flags |= NodeAttrs::Dead;
- NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
- SA.Addr->addMember(DA, *this);
- assert(!DoneDefs.test(R));
- DoneDefs.set(R);
- }
-
- // Process reg-masks (as clobbers).
- BitVector DoneClobbers(TRI.getNumRegs());
- for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
- MachineOperand &Op = In.getOperand(OpN);
- if (!Op.isRegMask())
- continue;
- uint16_t Flags = NodeAttrs::Clobbering | NodeAttrs::Fixed |
- NodeAttrs::Dead;
- NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
- SA.Addr->addMember(DA, *this);
- // Record all clobbered registers in DoneDefs.
- const uint32_t *RM = Op.getRegMask();
- for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i)
- if (!(RM[i/32] & (1u << (i%32))))
- DoneClobbers.set(i);
- }
-
- // Process implicit defs, skipping those that have already been added
- // as explicit.
- for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
- MachineOperand &Op = In.getOperand(OpN);
- if (!Op.isReg() || !Op.isDef() || !Op.isImplicit())
- continue;
- Register R = Op.getReg();
- if (!R || !Register::isPhysicalRegister(R) || DoneDefs.test(R))
- continue;
- RegisterRef RR = makeRegRef(Op);
- uint16_t Flags = NodeAttrs::None;
- if (TOI.isPreserving(In, OpN)) {
- Flags |= NodeAttrs::Preserving;
- // If the def is preserving, check if it is also undefined.
- if (isDefUndef(In, RR))
- Flags |= NodeAttrs::Undef;
- }
- if (TOI.isClobbering(In, OpN))
- Flags |= NodeAttrs::Clobbering;
- if (TOI.isFixedReg(In, OpN))
- Flags |= NodeAttrs::Fixed;
- if (IsCall && Op.isDead()) {
- if (DoneClobbers.test(R))
- continue;
- Flags |= NodeAttrs::Dead;
- }
- NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
- SA.Addr->addMember(DA, *this);
- DoneDefs.set(R);
- }
-
- for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
- MachineOperand &Op = In.getOperand(OpN);
- if (!Op.isReg() || !Op.isUse())
- continue;
- Register R = Op.getReg();
- if (!R || !Register::isPhysicalRegister(R))
- continue;
- uint16_t Flags = NodeAttrs::None;
- if (Op.isUndef())
- Flags |= NodeAttrs::Undef;
- if (TOI.isFixedReg(In, OpN))
- Flags |= NodeAttrs::Fixed;
- NodeAddr<UseNode*> UA = newUse(SA, Op, Flags);
- SA.Addr->addMember(UA, *this);
- }
-}
-
-// Scan all defs in the block node BA and record in PhiM the locations of
-// phi nodes corresponding to these defs.
-void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM,
- NodeAddr<BlockNode*> BA) {
- // Check all defs from block BA and record them in each block in BA's
- // iterated dominance frontier. This information will later be used to
- // create phi nodes.
- MachineBasicBlock *BB = BA.Addr->getCode();
- assert(BB);
- auto DFLoc = MDF.find(BB);
- if (DFLoc == MDF.end() || DFLoc->second.empty())
- return;
-
- // Traverse all instructions in the block and collect the set of all
- // defined references. For each reference there will be a phi created
- // in the block's iterated dominance frontier.
- // This is done to make sure that each defined reference gets only one
- // phi node, even if it is defined multiple times.
- RegisterSet Defs;
- for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this))
- for (NodeAddr<RefNode*> RA : IA.Addr->members_if(IsDef, *this))
- Defs.insert(RA.Addr->getRegRef(*this));
-
- // Calculate the iterated dominance frontier of BB.
- const MachineDominanceFrontier::DomSetType &DF = DFLoc->second;
- SetVector<MachineBasicBlock*> IDF(DF.begin(), DF.end());
- for (unsigned i = 0; i < IDF.size(); ++i) {
- auto F = MDF.find(IDF[i]);
- if (F != MDF.end())
- IDF.insert(F->second.begin(), F->second.end());
- }
-
- // Finally, add the set of defs to each block in the iterated dominance
- // frontier.
- for (auto DB : IDF) {
- NodeAddr<BlockNode*> DBA = findBlock(DB);
- PhiM[DBA.Id].insert(Defs.begin(), Defs.end());
- }
-}
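-// For illustration: if block B defines register R, DF(B) = {C} and
-// DF(C) = {D}, then the iterated dominance frontier of B is {C, D}, so R
-// is recorded in PhiM for both C and D and each of them will receive a
-// phi def for R.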
-
-// Given the locations of phi nodes in the map PhiM, create the phi nodes
-// that are located in the block node BA.
-void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs,
- NodeAddr<BlockNode*> BA) {
- // Check if this block has any DF defs, i.e. any defs for which this
- // block is in the iterated dominance frontier.
- auto HasDF = PhiM.find(BA.Id);
- if (HasDF == PhiM.end() || HasDF->second.empty())
- return;
-
- // First, remove all R in Refs such that there exists T in Refs
- // such that T covers R. In other words, only leave those refs that
- // are not covered by another ref (i.e. maximal with respect to covering).
-
- auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef {
- for (RegisterRef I : RRs)
- if (I != RR && RegisterAggr::isCoverOf(I, RR, PRI))
- RR = I;
- return RR;
- };
-
- RegisterSet MaxDF;
- for (RegisterRef I : HasDF->second)
- MaxDF.insert(MaxCoverIn(I, HasDF->second));
-
- std::vector<RegisterRef> MaxRefs;
- for (RegisterRef I : MaxDF)
- MaxRefs.push_back(MaxCoverIn(I, AllRefs));
-
- // Now, for each R in MaxRefs, get the alias closure of R. If the closure
- // only has R in it, create a phi with a def for R. Otherwise, create a phi,
- // and add a def for each S in the closure.
-
- // Sort the refs so that the phis will be created in a deterministic order.
- llvm::sort(MaxRefs);
- // Remove duplicates.
- auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end());
- MaxRefs.erase(NewEnd, MaxRefs.end());
-
- auto Aliased = [this,&MaxRefs](RegisterRef RR,
- std::vector<unsigned> &Closure) -> bool {
- for (unsigned I : Closure)
- if (PRI.alias(RR, MaxRefs[I]))
- return true;
- return false;
- };
-
- // Prepare a list of NodeIds of the block's predecessors.
- NodeList Preds;
- const MachineBasicBlock *MBB = BA.Addr->getCode();
- for (MachineBasicBlock *PB : MBB->predecessors())
- Preds.push_back(findBlock(PB));
-
- while (!MaxRefs.empty()) {
- // Put the first element in the closure, and then add all subsequent
- // elements from MaxRefs to it, if they alias at least one element
- // already in the closure.
- // ClosureIdx: vector of indices in MaxRefs of members of the closure.
- std::vector<unsigned> ClosureIdx = { 0 };
- for (unsigned i = 1; i != MaxRefs.size(); ++i)
- if (Aliased(MaxRefs[i], ClosureIdx))
- ClosureIdx.push_back(i);
-
- // Build a phi for the closure.
- unsigned CS = ClosureIdx.size();
- NodeAddr<PhiNode*> PA = newPhi(BA);
-
- // Add defs.
- for (unsigned X = 0; X != CS; ++X) {
- RegisterRef RR = MaxRefs[ClosureIdx[X]];
- uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
- NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
- PA.Addr->addMember(DA, *this);
- }
- // Add phi uses.
- for (NodeAddr<BlockNode*> PBA : Preds) {
- for (unsigned X = 0; X != CS; ++X) {
- RegisterRef RR = MaxRefs[ClosureIdx[X]];
- NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA);
- PA.Addr->addMember(PUA, *this);
- }
- }
-
- // Erase from MaxRefs all elements in the closure.
- auto Begin = MaxRefs.begin();
- for (unsigned i = ClosureIdx.size(); i != 0; --i)
- MaxRefs.erase(Begin + ClosureIdx[i-1]);
- }
-}
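-// For illustration (hypothetical Hexagon registers): if MaxRefs is
-// { D0, R0, R1 }, where the double register D0 aliases both R0 and R1,
-// the first closure collects all three and a single phi is created with
-// one def (plus one use per predecessor) for each of them; a register
-// aliasing none of them would get its own phi on a later iteration.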
-
-// Remove any unneeded phi nodes that were created during the build process.
-void DataFlowGraph::removeUnusedPhis() {
- // This will remove unused phis, i.e. phis where each def does not reach
- // any uses or other defs. This will not detect or remove circular phi
- // chains that are otherwise dead. Unused/dead phis are created during
- // the build process and this function is intended to remove these cases
- // that are easily determinable to be unnecessary.
-
- SetVector<NodeId> PhiQ;
- for (NodeAddr<BlockNode*> BA : Func.Addr->members(*this)) {
- for (auto P : BA.Addr->members_if(IsPhi, *this))
- PhiQ.insert(P.Id);
- }
-
- static auto HasUsedDef = [](NodeList &Ms) -> bool {
- for (NodeAddr<NodeBase*> M : Ms) {
- if (M.Addr->getKind() != NodeAttrs::Def)
- continue;
- NodeAddr<DefNode*> DA = M;
- if (DA.Addr->getReachedDef() != 0 || DA.Addr->getReachedUse() != 0)
- return true;
- }
- return false;
- };
-
- // Any phi, if it is removed, may affect other phis (make them dead).
- // For each removed phi, collect the potentially affected phis and add
- // them back to the queue.
- while (!PhiQ.empty()) {
- auto PA = addr<PhiNode*>(PhiQ[0]);
- PhiQ.remove(PA.Id);
- NodeList Refs = PA.Addr->members(*this);
- if (HasUsedDef(Refs))
- continue;
- for (NodeAddr<RefNode*> RA : Refs) {
- if (NodeId RD = RA.Addr->getReachingDef()) {
- auto RDA = addr<DefNode*>(RD);
- NodeAddr<InstrNode*> OA = RDA.Addr->getOwner(*this);
- if (IsPhi(OA))
- PhiQ.insert(OA.Id);
- }
- if (RA.Addr->isDef())
- unlinkDef(RA, true);
- else
- unlinkUse(RA, true);
- }
- NodeAddr<BlockNode*> BA = PA.Addr->getOwner(*this);
- BA.Addr->removeMember(PA, *this);
- }
-}
-
-// For a given reference node TA in an instruction node IA, connect the
-// reaching def of TA to the appropriate def node. Create any shadow nodes
-// as appropriate.
-template <typename T>
-void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA,
- DefStack &DS) {
- if (DS.empty())
- return;
- RegisterRef RR = TA.Addr->getRegRef(*this);
- NodeAddr<T> TAP;
-
- // References from the def stack that have been examined so far.
- RegisterAggr Defs(PRI);
-
- for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) {
- RegisterRef QR = I->Addr->getRegRef(*this);
-
- // Skip all defs that are aliased to any of the defs that we have already
- // seen. If this completes a cover of RR, stop the stack traversal.
- bool Alias = Defs.hasAliasOf(QR);
- bool Cover = Defs.insert(QR).hasCoverOf(RR);
- if (Alias) {
- if (Cover)
- break;
- continue;
- }
-
- // The reaching def.
- NodeAddr<DefNode*> RDA = *I;
-
- // Pick the reached node.
- if (TAP.Id == 0) {
- TAP = TA;
- } else {
- // Mark the existing ref as "shadow" and create a new shadow.
- TAP.Addr->setFlags(TAP.Addr->getFlags() | NodeAttrs::Shadow);
- TAP = getNextShadow(IA, TAP, true);
- }
-
- // Create the link.
- TAP.Addr->linkToDef(TAP.Id, RDA);
-
- if (Cover)
- break;
- }
-}
-
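The Alias/Cover interplay above is easier to see with registers reduced to plain unit sets. A simplified standalone sketch (std::set<int> stands in for RegisterRef/RegisterAggr; the shadow-node bookkeeping is omitted):

    #include <set>
    #include <vector>

    using Units = std::set<int>; // Stand-in for a register's set of reg units.

    static bool intersects(const Units &A, const Units &B) {
      for (int U : A)
        if (B.count(U))
          return true;
      return false;
    }

    static bool covers(const Units &Sup, const Units &Sub) {
      for (int U : Sub)
        if (!Sup.count(U))
          return false;
      return true;
    }

    // Walk defs from the top of the stack down. A def that only re-touches
    // units already seen is skipped; a def adding new units gets linked.
    // Stop as soon as the accumulated defs cover the target reference.
    std::vector<Units> pickReachingDefs(const std::vector<Units> &TopDown,
                                        const Units &Target) {
      std::vector<Units> Linked;
      Units Seen;
      for (const Units &QR : TopDown) {
        bool Alias = intersects(Seen, QR);
        Seen.insert(QR.begin(), QR.end());
        bool Cover = covers(Seen, Target);
        if (!Alias)
          Linked.push_back(QR); // One (possibly shadow) ref links to this def.
        if (Cover)
          break;
      }
      return Linked;
    }
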
-// Create data-flow links for all reference nodes in the statement node SA.
-template <typename Predicate>
-void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA,
- Predicate P) {
-#ifndef NDEBUG
- RegisterSet Defs;
-#endif
-
- // Link all nodes (upwards in the data-flow) with their reaching defs.
- for (NodeAddr<RefNode*> RA : SA.Addr->members_if(P, *this)) {
- uint16_t Kind = RA.Addr->getKind();
- assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use);
- RegisterRef RR = RA.Addr->getRegRef(*this);
-#ifndef NDEBUG
- // Do not expect multiple defs of the same reference.
- assert(Kind != NodeAttrs::Def || !Defs.count(RR));
- Defs.insert(RR);
-#endif
-
- auto F = DefM.find(RR.Reg);
- if (F == DefM.end())
- continue;
- DefStack &DS = F->second;
- if (Kind == NodeAttrs::Use)
- linkRefUp<UseNode*>(SA, RA, DS);
- else if (Kind == NodeAttrs::Def)
- linkRefUp<DefNode*>(SA, RA, DS);
- else
- llvm_unreachable("Unexpected node in instruction");
- }
-}
-
-// Create data-flow links for all instructions in the block node BA. This
-// will include updating any phi nodes in BA.
-void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
- // Push block delimiters.
- markBlock(BA.Id, DefM);
-
- auto IsClobber = [] (NodeAddr<RefNode*> RA) -> bool {
- return IsDef(RA) && (RA.Addr->getFlags() & NodeAttrs::Clobbering);
- };
- auto IsNoClobber = [] (NodeAddr<RefNode*> RA) -> bool {
- return IsDef(RA) && !(RA.Addr->getFlags() & NodeAttrs::Clobbering);
- };
-
- assert(BA.Addr && "block node address is needed to create a data-flow link");
- // For each non-phi instruction in the block, link all the defs and uses
- // to their reaching defs. For any member of the block (including phis),
- // push the defs on the corresponding stacks.
- for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) {
- // Ignore phi nodes here. They will be linked part by part from the
- // predecessors.
- if (IA.Addr->getKind() == NodeAttrs::Stmt) {
- linkStmtRefs(DefM, IA, IsUse);
- linkStmtRefs(DefM, IA, IsClobber);
- }
-
- // Push the definitions on the stack.
- pushClobbers(IA, DefM);
-
- if (IA.Addr->getKind() == NodeAttrs::Stmt)
- linkStmtRefs(DefM, IA, IsNoClobber);
-
- pushDefs(IA, DefM);
- }
-
- // Recursively process all children in the dominator tree.
- MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode());
- for (auto I : *N) {
- MachineBasicBlock *SB = I->getBlock();
- NodeAddr<BlockNode*> SBA = findBlock(SB);
- linkBlockRefs(DefM, SBA);
- }
-
- // Link the phi uses from the successor blocks.
- auto IsUseForBA = [BA](NodeAddr<NodeBase*> NA) -> bool {
- if (NA.Addr->getKind() != NodeAttrs::Use)
- return false;
- assert(NA.Addr->getFlags() & NodeAttrs::PhiRef);
- NodeAddr<PhiUseNode*> PUA = NA;
- return PUA.Addr->getPredecessor() == BA.Id;
- };
-
- RegisterSet EHLiveIns = getLandingPadLiveIns();
- MachineBasicBlock *MBB = BA.Addr->getCode();
-
- for (MachineBasicBlock *SB : MBB->successors()) {
- bool IsEHPad = SB->isEHPad();
- NodeAddr<BlockNode*> SBA = findBlock(SB);
- for (NodeAddr<InstrNode*> IA : SBA.Addr->members_if(IsPhi, *this)) {
- // Do not link phi uses for landing pad live-ins.
- if (IsEHPad) {
- // Find what register this phi is for.
- NodeAddr<RefNode*> RA = IA.Addr->getFirstMember(*this);
- assert(RA.Id != 0);
- if (EHLiveIns.count(RA.Addr->getRegRef(*this)))
- continue;
- }
- // Go over each phi use associated with MBB, and link it.
- for (auto U : IA.Addr->members_if(IsUseForBA, *this)) {
- NodeAddr<PhiUseNode*> PUA = U;
- RegisterRef RR = PUA.Addr->getRegRef(*this);
- linkRefUp<UseNode*>(IA, PUA, DefM[RR.Reg]);
- }
- }
- }
-
- // Pop all defs from this block from the definition stacks.
- releaseBlock(BA.Id, DefM);
-}
-
-// Remove the use node UA from any data-flow and structural links.
-void DataFlowGraph::unlinkUseDF(NodeAddr<UseNode*> UA) {
- NodeId RD = UA.Addr->getReachingDef();
- NodeId Sib = UA.Addr->getSibling();
-
- if (RD == 0) {
- assert(Sib == 0);
- return;
- }
-
- auto RDA = addr<DefNode*>(RD);
- auto TA = addr<UseNode*>(RDA.Addr->getReachedUse());
- if (TA.Id == UA.Id) {
- RDA.Addr->setReachedUse(Sib);
- return;
- }
-
- while (TA.Id != 0) {
- NodeId S = TA.Addr->getSibling();
- if (S == UA.Id) {
- TA.Addr->setSibling(UA.Addr->getSibling());
- return;
- }
- TA = addr<UseNode*>(S);
- }
-}
-
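The sibling-chain removal above is the classic unlink from a singly linked list. The same walk with pointers instead of NodeIds (a hypothetical minimal Node type, not the one in the graph):

    // Hypothetical minimal node: only the sibling link matters here.
    struct Node { Node *Sib = nullptr; };

    // Remove Target from the chain headed at *Head, as unlinkUseDF removes
    // UA from its reaching def's chain of reached uses.
    void removeFromChain(Node **Head, Node *Target) {
      for (Node **P = Head; *P != nullptr; P = &(*P)->Sib) {
        if (*P == Target) {
          *P = Target->Sib; // Bypass Target; the rest of the chain is kept.
          return;
        }
      }
    }
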
-// Remove the def node DA from any data-flow and structural links.
-void DataFlowGraph::unlinkDefDF(NodeAddr<DefNode*> DA) {
- //
- // RD
- // | reached
- // | def
- // :
- // .
- // +----+
- // ... -- | DA | -- ... -- 0 : sibling chain of DA
- // +----+
- // | | reached
- // | : def
- // | .
- // | ... : Siblings (defs)
- // |
- // : reached
- // . use
- // ... : sibling chain of reached uses
-
- NodeId RD = DA.Addr->getReachingDef();
-
- // Visit all siblings of the reached def and reset their reaching defs.
- // Also, defs reached by DA are now "promoted" to being reached by RD,
- // so all of them will need to be spliced into the sibling chain where
- // DA belongs.
- auto getAllNodes = [this] (NodeId N) -> NodeList {
- NodeList Res;
- while (N) {
- auto RA = addr<RefNode*>(N);
- // Keep the nodes in the exact sibling order.
- Res.push_back(RA);
- N = RA.Addr->getSibling();
- }
- return Res;
- };
- NodeList ReachedDefs = getAllNodes(DA.Addr->getReachedDef());
- NodeList ReachedUses = getAllNodes(DA.Addr->getReachedUse());
-
- if (RD == 0) {
- for (NodeAddr<RefNode*> I : ReachedDefs)
- I.Addr->setSibling(0);
- for (NodeAddr<RefNode*> I : ReachedUses)
- I.Addr->setSibling(0);
- }
- for (NodeAddr<DefNode*> I : ReachedDefs)
- I.Addr->setReachingDef(RD);
- for (NodeAddr<UseNode*> I : ReachedUses)
- I.Addr->setReachingDef(RD);
-
- NodeId Sib = DA.Addr->getSibling();
- if (RD == 0) {
- assert(Sib == 0);
- return;
- }
-
- // Update the reaching def node and remove DA from the sibling list.
- auto RDA = addr<DefNode*>(RD);
- auto TA = addr<DefNode*>(RDA.Addr->getReachedDef());
- if (TA.Id == DA.Id) {
- // If DA is the first reached def, just update the RD's reached def
- // to the DA's sibling.
- RDA.Addr->setReachedDef(Sib);
- } else {
- // Otherwise, traverse the sibling list of the reached defs and remove
- // DA from it.
- while (TA.Id != 0) {
- NodeId S = TA.Addr->getSibling();
- if (S == DA.Id) {
- TA.Addr->setSibling(Sib);
- break;
- }
- TA = addr<DefNode*>(S);
- }
- }
-
- // Splice the DA's reached defs into the RDA's reached def chain.
- if (!ReachedDefs.empty()) {
- auto Last = NodeAddr<DefNode*>(ReachedDefs.back());
- Last.Addr->setSibling(RDA.Addr->getReachedDef());
- RDA.Addr->setReachedDef(ReachedDefs.front().Id);
- }
- // Splice the DA's reached uses into the RDA's reached use chain.
- if (!ReachedUses.empty()) {
- auto Last = NodeAddr<UseNode*>(ReachedUses.back());
- Last.Addr->setSibling(RDA.Addr->getReachedUse());
- RDA.Addr->setReachedUse(ReachedUses.front().Id);
- }
-}
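
The two splices at the end amount to prepending one chain onto another. In pointer form (same hypothetical Node type as in the earlier sketch):

    #include <cassert>

    struct Node { Node *Sib = nullptr; };

    // Prepend the chain First..Last onto Head and return the new head --
    // the move unlinkDefDF performs on both the reached-def and reached-use
    // chains, expressed with pointers instead of NodeIds.
    Node *spliceChain(Node *Head, Node *First, Node *Last) {
      if (First == nullptr)
        return Head;             // Nothing to splice.
      assert(Last != nullptr && Last->Sib == nullptr);
      Last->Sib = Head;          // The old chain continues after the new one.
      return First;
    }
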
Property changes on: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFGraph.cpp
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFRegisters.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFRegisters.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFRegisters.cpp (nonexistent)
@@ -1,380 +0,0 @@
-//===- RDFRegisters.cpp ---------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "RDFRegisters.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/MC/LaneBitmask.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <cstdint>
-#include <set>
-#include <utility>
-
-using namespace llvm;
-using namespace rdf;
-
-PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri,
- const MachineFunction &mf)
- : TRI(tri) {
- RegInfos.resize(TRI.getNumRegs());
-
- BitVector BadRC(TRI.getNumRegs());
- for (const TargetRegisterClass *RC : TRI.regclasses()) {
- for (MCPhysReg R : *RC) {
- RegInfo &RI = RegInfos[R];
- if (RI.RegClass != nullptr && !BadRC[R]) {
- if (RC->LaneMask != RI.RegClass->LaneMask) {
- BadRC.set(R);
- RI.RegClass = nullptr;
- }
- } else
- RI.RegClass = RC;
- }
- }
-
- UnitInfos.resize(TRI.getNumRegUnits());
-
- for (uint32_t U = 0, NU = TRI.getNumRegUnits(); U != NU; ++U) {
- if (UnitInfos[U].Reg != 0)
- continue;
- MCRegUnitRootIterator R(U, &TRI);
- assert(R.isValid());
- RegisterId F = *R;
- ++R;
- if (R.isValid()) {
- UnitInfos[U].Mask = LaneBitmask::getAll();
- UnitInfos[U].Reg = F;
- } else {
- for (MCRegUnitMaskIterator I(F, &TRI); I.isValid(); ++I) {
- std::pair<uint32_t,LaneBitmask> P = *I;
- UnitInfo &UI = UnitInfos[P.first];
- UI.Reg = F;
- if (P.second.any()) {
- UI.Mask = P.second;
- } else {
- if (const TargetRegisterClass *RC = RegInfos[F].RegClass)
- UI.Mask = RC->LaneMask;
- else
- UI.Mask = LaneBitmask::getAll();
- }
- }
- }
- }
-
- for (const uint32_t *RM : TRI.getRegMasks())
- RegMasks.insert(RM);
- for (const MachineBasicBlock &B : mf)
- for (const MachineInstr &In : B)
- for (const MachineOperand &Op : In.operands())
- if (Op.isRegMask())
- RegMasks.insert(Op.getRegMask());
-
- MaskInfos.resize(RegMasks.size()+1);
- for (uint32_t M = 1, NM = RegMasks.size(); M <= NM; ++M) {
- BitVector PU(TRI.getNumRegUnits());
- const uint32_t *MB = RegMasks.get(M);
- for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) {
- if (!(MB[i/32] & (1u << (i%32))))
- continue;
- for (MCRegUnitIterator U(i, &TRI); U.isValid(); ++U)
- PU.set(*U);
- }
- MaskInfos[M].Units = PU.flip();
- }
-}
-
-RegisterRef PhysicalRegisterInfo::normalize(RegisterRef RR) const {
- return RR;
-}
-
-std::set<RegisterId> PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const {
-  // Do not include Reg in the alias set.
- std::set<RegisterId> AS;
- assert(isRegMaskId(Reg) || Register::isPhysicalRegister(Reg));
- if (isRegMaskId(Reg)) {
- // XXX SLOW
- const uint32_t *MB = getRegMaskBits(Reg);
- for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) {
- if (MB[i/32] & (1u << (i%32)))
- continue;
- AS.insert(i);
- }
- for (const uint32_t *RM : RegMasks) {
- RegisterId MI = getRegMaskId(RM);
- if (MI != Reg && aliasMM(RegisterRef(Reg), RegisterRef(MI)))
- AS.insert(MI);
- }
- return AS;
- }
-
- for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI)
- AS.insert(*AI);
- for (const uint32_t *RM : RegMasks) {
- RegisterId MI = getRegMaskId(RM);
- if (aliasRM(RegisterRef(Reg), RegisterRef(MI)))
- AS.insert(MI);
- }
- return AS;
-}
-
-bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const {
- assert(Register::isPhysicalRegister(RA.Reg));
- assert(Register::isPhysicalRegister(RB.Reg));
-
- MCRegUnitMaskIterator UMA(RA.Reg, &TRI);
- MCRegUnitMaskIterator UMB(RB.Reg, &TRI);
-  // Reg units are returned in numerical order.
- while (UMA.isValid() && UMB.isValid()) {
- // Skip units that are masked off in RA.
- std::pair<RegisterId,LaneBitmask> PA = *UMA;
- if (PA.second.any() && (PA.second & RA.Mask).none()) {
- ++UMA;
- continue;
- }
- // Skip units that are masked off in RB.
- std::pair<RegisterId,LaneBitmask> PB = *UMB;
- if (PB.second.any() && (PB.second & RB.Mask).none()) {
- ++UMB;
- continue;
- }
-
- if (PA.first == PB.first)
- return true;
- if (PA.first < PB.first)
- ++UMA;
- else if (PB.first < PA.first)
- ++UMB;
- }
- return false;
-}
-
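Stripped of the lane masks, the unit scan above is a two-pointer merge over two sorted sequences, answering "do they share an element?". A standalone sketch:

    #include <vector>

    // Two ascending unit lists overlap iff the merge scan finds an equal
    // pair; each step advances the side with the smaller element.
    bool sortedListsOverlap(const std::vector<unsigned> &A,
                            const std::vector<unsigned> &B) {
      size_t I = 0, J = 0;
      while (I != A.size() && J != B.size()) {
        if (A[I] == B[J])
          return true;
        if (A[I] < B[J])
          ++I;
        else
          ++J;
      }
      return false;
    }
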
-bool PhysicalRegisterInfo::aliasRM(RegisterRef RR, RegisterRef RM) const {
- assert(Register::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg));
- const uint32_t *MB = getRegMaskBits(RM.Reg);
- bool Preserved = MB[RR.Reg/32] & (1u << (RR.Reg%32));
- // If the lane mask information is "full", e.g. when the given lane mask
- // is a superset of the lane mask from the register class, check the regmask
- // bit directly.
- if (RR.Mask == LaneBitmask::getAll())
- return !Preserved;
- const TargetRegisterClass *RC = RegInfos[RR.Reg].RegClass;
- if (RC != nullptr && (RR.Mask & RC->LaneMask) == RC->LaneMask)
- return !Preserved;
-
- // Otherwise, check all subregisters whose lane mask overlaps the given
- // mask. For each such register, if it is preserved by the regmask, then
-  // clear the corresponding bits in the given mask. If at the end all
-  // bits have been cleared, the register does not alias the regmask (i.e.
-  // it is preserved by it).
- LaneBitmask M = RR.Mask;
- for (MCSubRegIndexIterator SI(RR.Reg, &TRI); SI.isValid(); ++SI) {
- LaneBitmask SM = TRI.getSubRegIndexLaneMask(SI.getSubRegIndex());
- if ((SM & RR.Mask).none())
- continue;
- unsigned SR = SI.getSubReg();
- if (!(MB[SR/32] & (1u << (SR%32))))
- continue;
- // The subregister SR is preserved.
- M &= ~SM;
- if (M.none())
- return false;
- }
-
- return true;
-}
-
-bool PhysicalRegisterInfo::aliasMM(RegisterRef RM, RegisterRef RN) const {
- assert(isRegMaskId(RM.Reg) && isRegMaskId(RN.Reg));
- unsigned NumRegs = TRI.getNumRegs();
- const uint32_t *BM = getRegMaskBits(RM.Reg);
- const uint32_t *BN = getRegMaskBits(RN.Reg);
-
- for (unsigned w = 0, nw = NumRegs/32; w != nw; ++w) {
- // Intersect the negations of both words. Disregard reg=0,
- // i.e. 0th bit in the 0th word.
- uint32_t C = ~BM[w] & ~BN[w];
- if (w == 0)
- C &= ~1;
- if (C)
- return true;
- }
-
- // Check the remaining registers in the last word.
- unsigned TailRegs = NumRegs % 32;
- if (TailRegs == 0)
- return false;
- unsigned TW = NumRegs / 32;
- uint32_t TailMask = (1u << TailRegs) - 1;
- if (~BM[TW] & ~BN[TW] & TailMask)
- return true;
-
- return false;
-}
-
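The word-wise test above exploits the regmask encoding: bit set = register preserved, bit clear = register clobbered, so two masks alias exactly when some register other than 0 is clobbered by both. A per-register (unoptimized) sketch of the same test:

    #include <cstdint>

    // A and B are regmask words covering registers 1..NumRegs-1, one bit
    // per register. Equivalent to scanning ~A & ~B for any set bit other
    // than bit 0.
    bool masksAlias(const uint32_t *A, const uint32_t *B, unsigned NumRegs) {
      for (unsigned R = 1; R != NumRegs; ++R) {
        uint32_t Word = R / 32, Bit = 1u << (R % 32);
        if (!(A[Word] & Bit) && !(B[Word] & Bit))
          return true; // R is clobbered by both masks.
      }
      return false;
    }
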
-RegisterRef PhysicalRegisterInfo::mapTo(RegisterRef RR, unsigned R) const {
- if (RR.Reg == R)
- return RR;
- if (unsigned Idx = TRI.getSubRegIndex(R, RR.Reg))
- return RegisterRef(R, TRI.composeSubRegIndexLaneMask(Idx, RR.Mask));
- if (unsigned Idx = TRI.getSubRegIndex(RR.Reg, R)) {
- const RegInfo &RI = RegInfos[R];
- LaneBitmask RCM = RI.RegClass ? RI.RegClass->LaneMask
- : LaneBitmask::getAll();
- LaneBitmask M = TRI.reverseComposeSubRegIndexLaneMask(Idx, RR.Mask);
- return RegisterRef(R, M & RCM);
- }
- llvm_unreachable("Invalid arguments: unrelated registers?");
-}
-
-bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
- if (PhysicalRegisterInfo::isRegMaskId(RR.Reg))
- return Units.anyCommon(PRI.getMaskUnits(RR.Reg));
-
- for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
- std::pair<uint32_t,LaneBitmask> P = *U;
- if (P.second.none() || (P.second & RR.Mask).any())
- if (Units.test(P.first))
- return true;
- }
- return false;
-}
-
-bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
- if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) {
- BitVector T(PRI.getMaskUnits(RR.Reg));
- return T.reset(Units).none();
- }
-
- for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
- std::pair<uint32_t,LaneBitmask> P = *U;
- if (P.second.none() || (P.second & RR.Mask).any())
- if (!Units.test(P.first))
- return false;
- }
- return true;
-}
-
-RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
- if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) {
- Units |= PRI.getMaskUnits(RR.Reg);
- return *this;
- }
-
- for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
- std::pair<uint32_t,LaneBitmask> P = *U;
- if (P.second.none() || (P.second & RR.Mask).any())
- Units.set(P.first);
- }
- return *this;
-}
-
-RegisterAggr &RegisterAggr::insert(const RegisterAggr &RG) {
- Units |= RG.Units;
- return *this;
-}
-
-RegisterAggr &RegisterAggr::intersect(RegisterRef RR) {
- return intersect(RegisterAggr(PRI).insert(RR));
-}
-
-RegisterAggr &RegisterAggr::intersect(const RegisterAggr &RG) {
- Units &= RG.Units;
- return *this;
-}
-
-RegisterAggr &RegisterAggr::clear(RegisterRef RR) {
- return clear(RegisterAggr(PRI).insert(RR));
-}
-
-RegisterAggr &RegisterAggr::clear(const RegisterAggr &RG) {
- Units.reset(RG.Units);
- return *this;
-}
-
-RegisterRef RegisterAggr::intersectWith(RegisterRef RR) const {
- RegisterAggr T(PRI);
- T.insert(RR).intersect(*this);
- if (T.empty())
- return RegisterRef();
- RegisterRef NR = T.makeRegRef();
- assert(NR);
- return NR;
-}
-
-RegisterRef RegisterAggr::clearIn(RegisterRef RR) const {
- return RegisterAggr(PRI).insert(RR).clear(*this).makeRegRef();
-}
-
-RegisterRef RegisterAggr::makeRegRef() const {
- int U = Units.find_first();
- if (U < 0)
- return RegisterRef();
-
- auto AliasedRegs = [this] (uint32_t Unit, BitVector &Regs) {
- for (MCRegUnitRootIterator R(Unit, &PRI.getTRI()); R.isValid(); ++R)
- for (MCSuperRegIterator S(*R, &PRI.getTRI(), true); S.isValid(); ++S)
- Regs.set(*S);
- };
-
- // Find the set of all registers that are aliased to all the units
- // in this aggregate.
-
- // Get all the registers aliased to the first unit in the bit vector.
- BitVector Regs(PRI.getTRI().getNumRegs());
- AliasedRegs(U, Regs);
- U = Units.find_next(U);
-
-  // For each other unit, intersect it with the set of all registers
-  // aliased to that unit.
- while (U >= 0) {
- BitVector AR(PRI.getTRI().getNumRegs());
- AliasedRegs(U, AR);
- Regs &= AR;
- U = Units.find_next(U);
- }
-
- // If there is at least one register remaining, pick the first one,
- // and consolidate the masks of all of its units contained in this
- // aggregate.
-
- int F = Regs.find_first();
- if (F <= 0)
- return RegisterRef();
-
- LaneBitmask M;
- for (MCRegUnitMaskIterator I(F, &PRI.getTRI()); I.isValid(); ++I) {
- std::pair<uint32_t,LaneBitmask> P = *I;
- if (Units.test(P.first))
- M |= P.second.none() ? LaneBitmask::getAll() : P.second;
- }
- return RegisterRef(F, M);
-}
-
-void RegisterAggr::print(raw_ostream &OS) const {
- OS << '{';
- for (int U = Units.find_first(); U >= 0; U = Units.find_next(U))
- OS << ' ' << printRegUnit(U, &PRI.getTRI());
- OS << " }";
-}
-
-RegisterAggr::rr_iterator::rr_iterator(const RegisterAggr &RG,
- bool End)
- : Owner(&RG) {
- for (int U = RG.Units.find_first(); U >= 0; U = RG.Units.find_next(U)) {
- RegisterRef R = RG.PRI.getRefForUnit(U);
- Masks[R.Reg] |= R.Mask;
- }
- Pos = End ? Masks.end() : Masks.begin();
- Index = End ? Masks.size() : 0;
-}
Property changes on: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFRegisters.cpp
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFGraph.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFGraph.h (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFGraph.h (nonexistent)
@@ -1,968 +0,0 @@
-//===- RDFGraph.h -----------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Target-independent, SSA-based data flow graph for register data flow (RDF)
-// for a non-SSA program representation (e.g. post-RA machine code).
-//
-//
-// *** Introduction
-//
-// The RDF graph is a collection of nodes, each of which denotes some element
-// of the program. There are two main types of such elements: code and refe-
-// rences. Conceptually, "code" is something that represents the structure
-// of the program, e.g. basic block or a statement, while "reference" is an
-// instance of accessing a register, e.g. a definition or a use. Nodes are
-// connected with each other based on the structure of the program (such as
-// blocks, instructions, etc.), and based on the data flow (e.g. reaching
-// definitions, reached uses, etc.). The single-reaching-definition principle
-// of SSA is generally observed, although, due to the non-SSA representation
-// of the program, there are some differences between the graph and a "pure"
-// SSA representation.
-//
-//
-// *** Implementation remarks
-//
-// Since the graph can contain a large number of nodes, memory consumption
-// was one of the major design considerations. As a result, there is a single
-// base class NodeBase which defines all members used by all possible derived
-// classes. The members are arranged in a union, and a derived class cannot
-// add any data members of its own. Each derived class only defines the
-// functional interface, i.e. member functions. NodeBase must be a POD,
-// which implies that all of its members must also be PODs.
-// Since nodes need to be connected with other nodes, pointers have been
-// replaced with 32-bit identifiers: each node has an id of type NodeId.
-// There are mapping functions in the graph that translate between actual
-// memory addresses and the corresponding identifiers.
-// A node id of 0 is equivalent to nullptr.
-//
-//
-// *** Structure of the graph
-//
-// A code node is always a collection of other nodes. For example, a code
-// node corresponding to a basic block will contain code nodes corresponding
-// to instructions. In turn, a code node corresponding to an instruction will
-// contain a list of reference nodes that correspond to the definitions and
-// uses of registers in that instruction. The members are arranged into a
-// circular list, which is yet another consequence of the effort to save
-// memory: for each member node it should be possible to obtain its owner,
-// and it should be possible to access all other members. There are other
-// ways to accomplish that, but the circular list seemed the most natural.
-//
-// +- CodeNode -+
-// | | <---------------------------------------------------+
-// +-+--------+-+ |
-// |FirstM |LastM |
-// | +-------------------------------------+ |
-// | | |
-// V V |
-// +----------+ Next +----------+ Next Next +----------+ Next |
-// | |----->| |-----> ... ----->| |----->-+
-// +- Member -+ +- Member -+ +- Member -+
-//
-// The order of members is such that related reference nodes (see below)
-// should be contiguous on the member list.
-//
-// A reference node is a node that encapsulates an access to a register,
-// in other words, data flowing into or out of a register. There are two
-// major kinds of reference nodes: defs and uses. A def node will contain
-// the id of the first reached use, and the id of the first reached def.
-// Each def and use will contain the id of the reaching def, and also the
-// id of the next reached def (for def nodes) or use (for use nodes).
-// The "next node sharing the same reaching def" is denoted as "sibling".
-// In summary:
-// - Def node contains: reaching def, sibling, first reached def, and first
-// reached use.
-// - Use node contains: reaching def and sibling.
-//
-// +-- DefNode --+
-// | R2 = ... | <---+--------------------+
-// ++---------+--+ | |
-// |Reached |Reached | |
-// |Def |Use | |
-// | | |Reaching |Reaching
-// | V |Def |Def
-// | +-- UseNode --+ Sib +-- UseNode --+ Sib Sib
-// | | ... = R2 |----->| ... = R2 |----> ... ----> 0
-// | +-------------+ +-------------+
-// V
-// +-- DefNode --+ Sib
-// | R2 = ... |----> ...
-// ++---------+--+
-// | |
-// | |
-// ... ...
-//
-// To get a full picture, the circular lists connecting blocks within a
-// function, instructions within a block, etc. should be superimposed with
-// the def-def, def-use links shown above.
-// To illustrate this, consider a small example in a pseudo-assembly:
-// foo:
-// add r2, r0, r1 ; r2 = r0+r1
-// addi r0, r2, 1 ; r0 = r2+1
-// ret r0 ; return value in r0
-//
-// The graph (in a format used by the debugging functions) would look like:
-//
-// DFG dump:[
-// f1: Function foo
-// b2: === %bb.0 === preds(0), succs(0):
-// p3: phi [d4<r0>(,d12,u9):]
-// p5: phi [d6<r1>(,,u10):]
-// s7: add [d8<r2>(,,u13):, u9<r0>(d4):, u10<r1>(d6):]
-// s11: addi [d12<r0>(d4,,u15):, u13<r2>(d8):]
-// s14: ret [u15<r0>(d12):]
-// ]
-//
-// The f1, b2, p3, etc. are node ids. The letter is prepended to indicate the
-// kind of the node (i.e. f - function, b - basic block, p - phi,
-// s - statement, d - def, u - use).
-// The format of a def node is:
-// dN<R>(rd,d,u):sib,
-// where
-// N - numeric node id,
-//    R - register being defined,
-// rd - reaching def,
-// d - reached def,
-// u - reached use,
-// sib - sibling.
-// The format of a use node is:
-// uN<R>[!](rd):sib,
-// where
-// N - numeric node id,
-// R - register being used,
-// rd - reaching def,
-// sib - sibling.
-// Possible annotations (usually preceding the node id):
-// + - preserving def,
-// ~ - clobbering def,
-// " - shadow ref (follows the node id),
-// ! - fixed register (appears after register name).
-//
-// The circular lists are not explicit in the dump.
-//
-//
-// *** Node attributes
-//
-// NodeBase has a member "Attrs", which is the primary way of determining
-// the node's characteristics. The fields in this member decide whether
-// the node is a code node or a reference node (i.e. node's "type"), then
-// within each type, the "kind" determines what specifically this node
-// represents. The remaining bits, "flags", contain additional information
-// that is even more detailed than the "kind".
-// CodeNode's kinds are:
-// - Phi: Phi node, members are reference nodes.
-// - Stmt: Statement, members are reference nodes.
-// - Block: Basic block, members are instruction nodes (i.e. Phi or Stmt).
-// - Func: The whole function. The members are basic block nodes.
-// RefNode's kinds are:
-// - Use.
-// - Def.
-//
-// Meaning of flags:
-// - Preserving: applies only to defs. A preserving def is one that can
-// preserve some of the original bits among those that are included in
-// the register associated with that def. For example, if R0 is a 32-bit
-// register, but a def can only change the lower 16 bits, then it will
-// be marked as preserving.
-// - Shadow: a reference that has duplicates holding additional reaching
-// defs (see more below).
-// - Clobbering: applied only to defs, indicates that the value generated
-// by this def is unspecified. A typical example would be volatile registers
-// after function calls.
-// - Fixed: the register in this def/use cannot be replaced with any other
-// register. A typical case would be a parameter register to a call, or
-// the register with the return value from a function.
-//   - Undef: the register in this reference is assumed to have
-// no pre-existing value, even if it appears to be reached by some def.
-// This is typically used to prevent keeping registers artificially live
-// in cases when they are defined via predicated instructions. For example:
-// r0 = add-if-true cond, r10, r11 (1)
-// r0 = add-if-false cond, r12, r13, implicit r0 (2)
-// ... = r0 (3)
-// Before (1), r0 is not intended to be live, and the use of r0 in (3) is
-// not meant to be reached by any def preceding (1). However, since the
-// defs in (1) and (2) are both preserving, these properties alone would
-// imply that the use in (3) may indeed be reached by some prior def.
-// Adding Undef flag to the def in (1) prevents that. The Undef flag
-// may be applied to both defs and uses.
-// - Dead: applies only to defs. The value coming out of a "dead" def is
-// assumed to be unused, even if the def appears to be reaching other defs
-// or uses. The motivation for this flag comes from dead defs on function
-// calls: there is no way to determine if such a def is dead without
-// analyzing the target's ABI. Hence the graph should contain this info,
-// as it is unavailable otherwise. On the other hand, a def without any
-// uses on a typical instruction is not the intended target for this flag.
-//
-// *** Shadow references
-//
-// It may happen that a super-register can have two (or more) non-overlapping
-// sub-registers. When both of these sub-registers are defined and followed
-// by a use of the super-register, the use of the super-register will not
-// have a unique reaching def: both defs of the sub-registers need to be
-// accounted for. In such cases, a duplicate use of the super-register is
-// added and it points to the extra reaching def. Both uses are marked with
-// a flag "shadow". Example:
-// Assume t0 is a super-register of r0 and r1, r0 and r1 do not overlap:
-// set r0, 1 ; r0 = 1
-// set r1, 1 ; r1 = 1
-// addi t1, t0, 1 ; t1 = t0+1
-//
-// The DFG:
-//   s1: set [d2<r0>(,,u7):]
-//   s3: set [d4<r1>(,,u8):]
-// s5: addi [d6<t1>(,,):, u7"<t0>(d2):, u8"<t0>(d4):]
-//
-// The statement s5 has two use nodes for t0: u7" and u8". The quotation
-// mark " indicates that the node is a shadow.
-//
-
-#ifndef LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
-#define LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
-
-#include "RDFRegisters.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/MC/LaneBitmask.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/MathExtras.h"
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <map>
-#include <set>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-// RDF uses uint32_t to refer to registers. This is to ensure that the type
-// size remains specific. In other places, registers are often stored using
-// unsigned.
-static_assert(sizeof(uint32_t) == sizeof(unsigned), "Those should be equal");
-
-namespace llvm {
-
-class MachineBasicBlock;
-class MachineDominanceFrontier;
-class MachineDominatorTree;
-class MachineFunction;
-class MachineInstr;
-class MachineOperand;
-class raw_ostream;
-class TargetInstrInfo;
-class TargetRegisterInfo;
-
-namespace rdf {
-
- using NodeId = uint32_t;
-
- struct DataFlowGraph;
-
- struct NodeAttrs {
- enum : uint16_t {
- None = 0x0000, // Nothing
-
- // Types: 2 bits
- TypeMask = 0x0003,
- Code = 0x0001, // 01, Container
- Ref = 0x0002, // 10, Reference
-
- // Kind: 3 bits
- KindMask = 0x0007 << 2,
- Def = 0x0001 << 2, // 001
- Use = 0x0002 << 2, // 010
- Phi = 0x0003 << 2, // 011
- Stmt = 0x0004 << 2, // 100
- Block = 0x0005 << 2, // 101
- Func = 0x0006 << 2, // 110
-
- // Flags: 7 bits for now
- FlagMask = 0x007F << 5,
- Shadow = 0x0001 << 5, // 0000001, Has extra reaching defs.
- Clobbering = 0x0002 << 5, // 0000010, Produces unspecified values.
- PhiRef = 0x0004 << 5, // 0000100, Member of PhiNode.
- Preserving = 0x0008 << 5, // 0001000, Def can keep original bits.
- Fixed = 0x0010 << 5, // 0010000, Fixed register.
- Undef = 0x0020 << 5, // 0100000, Has no pre-existing value.
- Dead = 0x0040 << 5, // 1000000, Does not define a value.
- };
-
- static uint16_t type(uint16_t T) { return T & TypeMask; }
- static uint16_t kind(uint16_t T) { return T & KindMask; }
- static uint16_t flags(uint16_t T) { return T & FlagMask; }
-
- static uint16_t set_type(uint16_t A, uint16_t T) {
- return (A & ~TypeMask) | T;
- }
-
- static uint16_t set_kind(uint16_t A, uint16_t K) {
- return (A & ~KindMask) | K;
- }
-
- static uint16_t set_flags(uint16_t A, uint16_t F) {
- return (A & ~FlagMask) | F;
- }
-
- // Test if A contains B.
- static bool contains(uint16_t A, uint16_t B) {
- if (type(A) != Code)
- return false;
- uint16_t KB = kind(B);
- switch (kind(A)) {
- case Func:
- return KB == Block;
- case Block:
- return KB == Phi || KB == Stmt;
- case Phi:
- case Stmt:
- return type(B) == Ref;
- }
- return false;
- }
- };
-
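Illustration only, assuming the NodeAttrs declarations above are in scope: the attribute word composes with the masked setters and decomposes with the getters, each field occupying disjoint bits.

    #include <cassert>
    #include <cstdint>

    void attrsRoundTrip() {
      uint16_t A = 0;
      A = NodeAttrs::set_type(A, NodeAttrs::Ref);
      A = NodeAttrs::set_kind(A, NodeAttrs::Def);
      A = NodeAttrs::set_flags(A, NodeAttrs::Clobbering | NodeAttrs::Dead);
      // The fields do not overlap, so the queries are independent.
      assert(NodeAttrs::type(A) == NodeAttrs::Ref);
      assert(NodeAttrs::kind(A) == NodeAttrs::Def);
      assert(NodeAttrs::flags(A) == (NodeAttrs::Clobbering | NodeAttrs::Dead));
    }
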
- struct BuildOptions {
- enum : unsigned {
- None = 0x00,
- KeepDeadPhis = 0x01, // Do not remove dead phis during build.
- };
- };
-
- template <typename T> struct NodeAddr {
- NodeAddr() = default;
- NodeAddr(T A, NodeId I) : Addr(A), Id(I) {}
-
-    // Type cast (casting constructor). This constructor is the reason for
-    // having this class instead of using std::pair.
- template <typename S> NodeAddr(const NodeAddr<S> &NA)
- : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {}
-
- bool operator== (const NodeAddr<T> &NA) const {
- assert((Addr == NA.Addr) == (Id == NA.Id));
- return Addr == NA.Addr;
- }
- bool operator!= (const NodeAddr<T> &NA) const {
- return !operator==(NA);
- }
-
- T Addr = nullptr;
- NodeId Id = 0;
- };
-
- struct NodeBase;
-
- // Fast memory allocation and translation between node id and node address.
- // This is really the same idea as the one underlying the "bump pointer
- // allocator", the difference being in the translation. A node id is
- // composed of two components: the index of the block in which it was
- // allocated, and the index within the block. With the default settings,
- // where the number of nodes per block is 4096, the node id (minus 1) is:
- //
- // bit position: 11 0
- // +----------------------------+--------------+
- // | Index of the block |Index in block|
- // +----------------------------+--------------+
- //
- // The actual node id is the above plus 1, to avoid creating a node id of 0.
- //
- // This method significantly improved the build time, compared to using maps
- // (std::unordered_map or DenseMap) to translate between pointers and ids.
- struct NodeAllocator {
- // Amount of storage for a single node.
- enum { NodeMemSize = 32 };
-
- NodeAllocator(uint32_t NPB = 4096)
- : NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)),
- IndexMask((1 << BitsPerIndex)-1) {
- assert(isPowerOf2_32(NPB));
- }
-
- NodeBase *ptr(NodeId N) const {
- uint32_t N1 = N-1;
- uint32_t BlockN = N1 >> BitsPerIndex;
- uint32_t Offset = (N1 & IndexMask) * NodeMemSize;
- return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset);
- }
-
- NodeId id(const NodeBase *P) const;
- NodeAddr<NodeBase*> New();
- void clear();
-
- private:
- void startNewBlock();
- bool needNewBlock();
-
- uint32_t makeId(uint32_t Block, uint32_t Index) const {
- // Add 1 to the id, to avoid the id of 0, which is treated as "null".
- return ((Block << BitsPerIndex) | Index) + 1;
- }
-
- const uint32_t NodesPerBlock;
- const uint32_t BitsPerIndex;
- const uint32_t IndexMask;
- char *ActiveEnd = nullptr;
- std::vector<char*> Blocks;
- using AllocatorTy = BumpPtrAllocatorImpl<MallocAllocator, 65536>;
- AllocatorTy MemPool;
- };
-
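The id arithmetic is worth a worked example. With the default 4096 nodes per block, the low 12 bits of the id (after subtracting 1) select the slot and the remaining bits select the block:

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t BitsPerIndex = 12;              // log2(4096)
    constexpr uint32_t IndexMask = (1u << BitsPerIndex) - 1;

    constexpr uint32_t makeId(uint32_t Block, uint32_t Index) {
      return ((Block << BitsPerIndex) | Index) + 1;    // +1: id 0 is "null".
    }

    int main() {
      uint32_t Id = makeId(3, 7);                      // Block 3, slot 7.
      uint32_t N1 = Id - 1;
      assert(N1 >> BitsPerIndex == 3);                 // Block, as ptr() does.
      assert((N1 & IndexMask) == 7);                   // Slot within block.
    }
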
- using RegisterSet = std::set<RegisterRef>;
-
- struct TargetOperandInfo {
- TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {}
- virtual ~TargetOperandInfo() = default;
-
- virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const;
- virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const;
- virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const;
-
- const TargetInstrInfo &TII;
- };
-
- // Packed register reference. Only used for storage.
- struct PackedRegisterRef {
- RegisterId Reg;
- uint32_t MaskId;
- };
-
- struct LaneMaskIndex : private IndexedSet<LaneBitmask> {
- LaneMaskIndex() = default;
-
- LaneBitmask getLaneMaskForIndex(uint32_t K) const {
- return K == 0 ? LaneBitmask::getAll() : get(K);
- }
-
- uint32_t getIndexForLaneMask(LaneBitmask LM) {
- assert(LM.any());
- return LM.all() ? 0 : insert(LM);
- }
-
- uint32_t getIndexForLaneMask(LaneBitmask LM) const {
- assert(LM.any());
- return LM.all() ? 0 : find(LM);
- }
- };
-
- struct NodeBase {
- public:
- // Make sure this is a POD.
- NodeBase() = default;
-
- uint16_t getType() const { return NodeAttrs::type(Attrs); }
- uint16_t getKind() const { return NodeAttrs::kind(Attrs); }
- uint16_t getFlags() const { return NodeAttrs::flags(Attrs); }
- NodeId getNext() const { return Next; }
-
- uint16_t getAttrs() const { return Attrs; }
- void setAttrs(uint16_t A) { Attrs = A; }
- void setFlags(uint16_t F) { setAttrs(NodeAttrs::set_flags(getAttrs(), F)); }
-
- // Insert node NA after "this" in the circular chain.
- void append(NodeAddr<NodeBase*> NA);
-
- // Initialize all members to 0.
- void init() { memset(this, 0, sizeof *this); }
-
- void setNext(NodeId N) { Next = N; }
-
- protected:
- uint16_t Attrs;
- uint16_t Reserved;
- NodeId Next; // Id of the next node in the circular chain.
- // Definitions of nested types. Using anonymous nested structs would make
- // this class definition clearer, but unnamed structs are not a part of
- // the standard.
- struct Def_struct {
- NodeId DD, DU; // Ids of the first reached def and use.
- };
- struct PhiU_struct {
- NodeId PredB; // Id of the predecessor block for a phi use.
- };
- struct Code_struct {
- void *CP; // Pointer to the actual code.
-      NodeId FirstM, LastM; // Ids of the first and last members.
- };
- struct Ref_struct {
- NodeId RD, Sib; // Ids of the reaching def and the sibling.
- union {
- Def_struct Def;
- PhiU_struct PhiU;
- };
- union {
- MachineOperand *Op; // Non-phi refs point to a machine operand.
- PackedRegisterRef PR; // Phi refs store register info directly.
- };
- };
-
- // The actual payload.
- union {
- Ref_struct Ref;
- Code_struct Code;
- };
- };
- // The allocator allocates chunks of 32 bytes for each node. The fact that
- // each node takes 32 bytes in memory is used for fast translation between
- // the node id and the node address.
- static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize,
- "NodeBase must be at most NodeAllocator::NodeMemSize bytes");
-
- using NodeList = SmallVector<NodeAddr<NodeBase *>, 4>;
- using NodeSet = std::set<NodeId>;
-
- struct RefNode : public NodeBase {
- RefNode() = default;
-
- RegisterRef getRegRef(const DataFlowGraph &G) const;
-
- MachineOperand &getOp() {
- assert(!(getFlags() & NodeAttrs::PhiRef));
- return *Ref.Op;
- }
-
- void setRegRef(RegisterRef RR, DataFlowGraph &G);
- void setRegRef(MachineOperand *Op, DataFlowGraph &G);
-
- NodeId getReachingDef() const {
- return Ref.RD;
- }
- void setReachingDef(NodeId RD) {
- Ref.RD = RD;
- }
-
- NodeId getSibling() const {
- return Ref.Sib;
- }
- void setSibling(NodeId Sib) {
- Ref.Sib = Sib;
- }
-
- bool isUse() const {
- assert(getType() == NodeAttrs::Ref);
- return getKind() == NodeAttrs::Use;
- }
-
- bool isDef() const {
- assert(getType() == NodeAttrs::Ref);
- return getKind() == NodeAttrs::Def;
- }
-
- template <typename Predicate>
- NodeAddr<RefNode*> getNextRef(RegisterRef RR, Predicate P, bool NextOnly,
- const DataFlowGraph &G);
- NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G);
- };
-
- struct DefNode : public RefNode {
- NodeId getReachedDef() const {
- return Ref.Def.DD;
- }
- void setReachedDef(NodeId D) {
- Ref.Def.DD = D;
- }
- NodeId getReachedUse() const {
- return Ref.Def.DU;
- }
- void setReachedUse(NodeId U) {
- Ref.Def.DU = U;
- }
-
- void linkToDef(NodeId Self, NodeAddr<DefNode*> DA);
- };
-
- struct UseNode : public RefNode {
- void linkToDef(NodeId Self, NodeAddr<DefNode*> DA);
- };
-
- struct PhiUseNode : public UseNode {
- NodeId getPredecessor() const {
- assert(getFlags() & NodeAttrs::PhiRef);
- return Ref.PhiU.PredB;
- }
- void setPredecessor(NodeId B) {
- assert(getFlags() & NodeAttrs::PhiRef);
- Ref.PhiU.PredB = B;
- }
- };
-
- struct CodeNode : public NodeBase {
- template <typename T> T getCode() const {
- return static_cast<T>(Code.CP);
- }
- void setCode(void *C) {
- Code.CP = C;
- }
-
- NodeAddr<NodeBase*> getFirstMember(const DataFlowGraph &G) const;
- NodeAddr<NodeBase*> getLastMember(const DataFlowGraph &G) const;
- void addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G);
- void addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA,
- const DataFlowGraph &G);
- void removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G);
-
- NodeList members(const DataFlowGraph &G) const;
- template <typename Predicate>
- NodeList members_if(Predicate P, const DataFlowGraph &G) const;
- };
-
- struct InstrNode : public CodeNode {
- NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G);
- };
-
- struct PhiNode : public InstrNode {
- MachineInstr *getCode() const {
- return nullptr;
- }
- };
-
- struct StmtNode : public InstrNode {
- MachineInstr *getCode() const {
- return CodeNode::getCode<MachineInstr*>();
- }
- };
-
- struct BlockNode : public CodeNode {
- MachineBasicBlock *getCode() const {
- return CodeNode::getCode<MachineBasicBlock*>();
- }
-
- void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G);
- };
-
- struct FuncNode : public CodeNode {
- MachineFunction *getCode() const {
- return CodeNode::getCode<MachineFunction*>();
- }
-
- NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB,
- const DataFlowGraph &G) const;
- NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G);
- };
-
- struct DataFlowGraph {
- DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
- const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
- const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi);
-
- NodeBase *ptr(NodeId N) const;
- template <typename T> T ptr(NodeId N) const {
- return static_cast<T>(ptr(N));
- }
-
- NodeId id(const NodeBase *P) const;
-
- template <typename T> NodeAddr<T> addr(NodeId N) const {
- return { ptr<T>(N), N };
- }
-
- NodeAddr<FuncNode*> getFunc() const { return Func; }
- MachineFunction &getMF() const { return MF; }
- const TargetInstrInfo &getTII() const { return TII; }
- const TargetRegisterInfo &getTRI() const { return TRI; }
- const PhysicalRegisterInfo &getPRI() const { return PRI; }
- const MachineDominatorTree &getDT() const { return MDT; }
- const MachineDominanceFrontier &getDF() const { return MDF; }
- const RegisterAggr &getLiveIns() const { return LiveIns; }
-
- struct DefStack {
- DefStack() = default;
-
- bool empty() const { return Stack.empty() || top() == bottom(); }
-
- private:
- using value_type = NodeAddr<DefNode *>;
- struct Iterator {
- using value_type = DefStack::value_type;
-
- Iterator &up() { Pos = DS.nextUp(Pos); return *this; }
- Iterator &down() { Pos = DS.nextDown(Pos); return *this; }
-
- value_type operator*() const {
- assert(Pos >= 1);
- return DS.Stack[Pos-1];
- }
- const value_type *operator->() const {
- assert(Pos >= 1);
- return &DS.Stack[Pos-1];
- }
- bool operator==(const Iterator &It) const { return Pos == It.Pos; }
- bool operator!=(const Iterator &It) const { return Pos != It.Pos; }
-
- private:
- friend struct DefStack;
-
- Iterator(const DefStack &S, bool Top);
-
- // Pos-1 is the index in the StorageType object that corresponds to
- // the top of the DefStack.
- const DefStack &DS;
- unsigned Pos;
- };
-
- public:
- using iterator = Iterator;
-
- iterator top() const { return Iterator(*this, true); }
- iterator bottom() const { return Iterator(*this, false); }
- unsigned size() const;
-
- void push(NodeAddr<DefNode*> DA) { Stack.push_back(DA); }
- void pop();
- void start_block(NodeId N);
- void clear_block(NodeId N);
-
- private:
- friend struct Iterator;
-
- using StorageType = std::vector<value_type>;
-
- bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const {
- return (P.Addr == nullptr) && (N == 0 || P.Id == N);
- }
-
- unsigned nextUp(unsigned P) const;
- unsigned nextDown(unsigned P) const;
-
- StorageType Stack;
- };
-
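The start_block/clear_block pair makes this a scoped stack: entering a block pushes a delimiter, leaving it pops everything down to (and including) that delimiter. A simplified standalone sketch, using a (IsDelimiter, Value) pair where the original uses an entry with a null Addr:

    #include <cassert>
    #include <utility>
    #include <vector>

    struct ScopedStack {
      // (IsDelimiter, Payload): the payload is a def id, or a block id for
      // delimiter entries.
      std::vector<std::pair<bool, unsigned>> S;

      void startBlock(unsigned BlockId) { S.push_back({true, BlockId}); }
      void push(unsigned DefId)         { S.push_back({false, DefId}); }

      // Pop everything pushed since startBlock(BlockId), delimiter included.
      void clearBlock(unsigned BlockId) {
        while (!S.empty() && !(S.back().first && S.back().second == BlockId))
          S.pop_back();
        assert(!S.empty() && "clearBlock without matching startBlock");
        S.pop_back();
      }
    };
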
-    // A std::unordered_map is used here for fast element access.
- // Map: Register (physical or virtual) -> DefStack
- using DefStackMap = std::unordered_map<RegisterId, DefStack>;
-
- void build(unsigned Options = BuildOptions::None);
- void pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
- void markBlock(NodeId B, DefStackMap &DefM);
- void releaseBlock(NodeId B, DefStackMap &DefM);
-
- PackedRegisterRef pack(RegisterRef RR) {
- return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) };
- }
- PackedRegisterRef pack(RegisterRef RR) const {
- return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) };
- }
- RegisterRef unpack(PackedRegisterRef PR) const {
- return RegisterRef(PR.Reg, LMI.getLaneMaskForIndex(PR.MaskId));
- }
-
- RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const;
- RegisterRef makeRegRef(const MachineOperand &Op) const;
- RegisterRef restrictRef(RegisterRef AR, RegisterRef BR) const;
-
- NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const;
- NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA, bool Create);
- NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const;
- NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA, bool Create);
- NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const;
-
- NodeList getRelatedRefs(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const;
-
- NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) const {
- return BlockNodes.at(BB);
- }
-
- void unlinkUse(NodeAddr<UseNode*> UA, bool RemoveFromOwner) {
- unlinkUseDF(UA);
- if (RemoveFromOwner)
- removeFromOwner(UA);
- }
-
- void unlinkDef(NodeAddr<DefNode*> DA, bool RemoveFromOwner) {
- unlinkDefDF(DA);
- if (RemoveFromOwner)
- removeFromOwner(DA);
- }
-
- // Some useful filters.
- template <uint16_t Kind>
- static bool IsRef(const NodeAddr<NodeBase*> BA) {
- return BA.Addr->getType() == NodeAttrs::Ref &&
- BA.Addr->getKind() == Kind;
- }
-
- template <uint16_t Kind>
- static bool IsCode(const NodeAddr<NodeBase*> BA) {
- return BA.Addr->getType() == NodeAttrs::Code &&
- BA.Addr->getKind() == Kind;
- }
-
- static bool IsDef(const NodeAddr<NodeBase*> BA) {
- return BA.Addr->getType() == NodeAttrs::Ref &&
- BA.Addr->getKind() == NodeAttrs::Def;
- }
-
- static bool IsUse(const NodeAddr<NodeBase*> BA) {
- return BA.Addr->getType() == NodeAttrs::Ref &&
- BA.Addr->getKind() == NodeAttrs::Use;
- }
-
- static bool IsPhi(const NodeAddr<NodeBase*> BA) {
- return BA.Addr->getType() == NodeAttrs::Code &&
- BA.Addr->getKind() == NodeAttrs::Phi;
- }
-
- static bool IsPreservingDef(const NodeAddr<DefNode*> DA) {
- uint16_t Flags = DA.Addr->getFlags();
- return (Flags & NodeAttrs::Preserving) && !(Flags & NodeAttrs::Undef);
- }
-
- private:
- void reset();
-
- RegisterSet getLandingPadLiveIns() const;
-
- NodeAddr<NodeBase*> newNode(uint16_t Attrs);
- NodeAddr<NodeBase*> cloneNode(const NodeAddr<NodeBase*> B);
- NodeAddr<UseNode*> newUse(NodeAddr<InstrNode*> Owner,
- MachineOperand &Op, uint16_t Flags = NodeAttrs::None);
- NodeAddr<PhiUseNode*> newPhiUse(NodeAddr<PhiNode*> Owner,
- RegisterRef RR, NodeAddr<BlockNode*> PredB,
- uint16_t Flags = NodeAttrs::PhiRef);
- NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner,
- MachineOperand &Op, uint16_t Flags = NodeAttrs::None);
- NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner,
- RegisterRef RR, uint16_t Flags = NodeAttrs::PhiRef);
- NodeAddr<PhiNode*> newPhi(NodeAddr<BlockNode*> Owner);
- NodeAddr<StmtNode*> newStmt(NodeAddr<BlockNode*> Owner,
- MachineInstr *MI);
- NodeAddr<BlockNode*> newBlock(NodeAddr<FuncNode*> Owner,
- MachineBasicBlock *BB);
- NodeAddr<FuncNode*> newFunc(MachineFunction *MF);
-
- template <typename Predicate>
- std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
- locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
- Predicate P) const;
-
- using BlockRefsMap = std::map<NodeId, RegisterSet>;
-
- void buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In);
- void recordDefsForDF(BlockRefsMap &PhiM, NodeAddr<BlockNode*> BA);
- void buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs,
- NodeAddr<BlockNode*> BA);
- void removeUnusedPhis();
-
- void pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DM);
- void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
- template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA,
- NodeAddr<T> TA, DefStack &DS);
- template <typename Predicate> void linkStmtRefs(DefStackMap &DefM,
- NodeAddr<StmtNode*> SA, Predicate P);
- void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA);
-
- void unlinkUseDF(NodeAddr<UseNode*> UA);
- void unlinkDefDF(NodeAddr<DefNode*> DA);
-
- void removeFromOwner(NodeAddr<RefNode*> RA) {
- NodeAddr<InstrNode*> IA = RA.Addr->getOwner(*this);
- IA.Addr->removeMember(RA, *this);
- }
-
- MachineFunction &MF;
- const TargetInstrInfo &TII;
- const TargetRegisterInfo &TRI;
- const PhysicalRegisterInfo PRI;
- const MachineDominatorTree &MDT;
- const MachineDominanceFrontier &MDF;
- const TargetOperandInfo &TOI;
-
- RegisterAggr LiveIns;
- NodeAddr<FuncNode*> Func;
- NodeAllocator Memory;
- // Local map: MachineBasicBlock -> NodeAddr<BlockNode*>
- std::map<MachineBasicBlock*,NodeAddr<BlockNode*>> BlockNodes;
- // Lane mask map.
- LaneMaskIndex LMI;
- }; // struct DataFlowGraph
-
- template <typename Predicate>
- NodeAddr<RefNode*> RefNode::getNextRef(RegisterRef RR, Predicate P,
- bool NextOnly, const DataFlowGraph &G) {
- // Get the "Next" reference in the circular list that references RR and
-    // satisfies predicate "P".
- auto NA = G.addr<NodeBase*>(getNext());
-
- while (NA.Addr != this) {
- if (NA.Addr->getType() == NodeAttrs::Ref) {
- NodeAddr<RefNode*> RA = NA;
- if (RA.Addr->getRegRef(G) == RR && P(NA))
- return NA;
- if (NextOnly)
- break;
- NA = G.addr<NodeBase*>(NA.Addr->getNext());
- } else {
- // We've hit the beginning of the chain.
- assert(NA.Addr->getType() == NodeAttrs::Code);
- NodeAddr<CodeNode*> CA = NA;
- NA = CA.Addr->getFirstMember(G);
- }
- }
- // Return the equivalent of "nullptr" if such a node was not found.
- return NodeAddr<RefNode*>();
- }
-
- template <typename Predicate>
- NodeList CodeNode::members_if(Predicate P, const DataFlowGraph &G) const {
- NodeList MM;
- auto M = getFirstMember(G);
- if (M.Id == 0)
- return MM;
-
- while (M.Addr != this) {
- if (P(M))
- MM.push_back(M);
- M = G.addr<NodeBase*>(M.Addr->getNext());
- }
- return MM;
- }
-
- template <typename T>
- struct Print {
- Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {}
-
- const T &Obj;
- const DataFlowGraph &G;
- };
-
- template <typename T>
- struct PrintNode : Print<NodeAddr<T>> {
- PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g)
- : Print<NodeAddr<T>>(x, g) {}
- };
-
- raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterRef> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeId> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<DefNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<UseNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<NodeAddr<PhiUseNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<RefNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeList> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeSet> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<PhiNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<NodeAddr<StmtNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<NodeAddr<InstrNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<NodeAddr<BlockNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<NodeAddr<FuncNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterSet> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterAggr> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<DataFlowGraph::DefStack> &P);
-
-} // end namespace rdf
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
Property changes on: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFGraph.h
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFRegisters.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFRegisters.h (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFRegisters.h (nonexistent)
@@ -1,240 +0,0 @@
-//===- RDFRegisters.h -------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
-#define LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
-
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/MC/LaneBitmask.h"
-#include <cassert>
-#include <cstdint>
-#include <map>
-#include <set>
-#include <vector>
-
-namespace llvm {
-
-class MachineFunction;
-class raw_ostream;
-
-namespace rdf {
-
- using RegisterId = uint32_t;
-
- // Template class for a map translating uint32_t into arbitrary types.
- // The map will act like an indexed set: upon insertion of a new object,
- // it will automatically assign a new index to it. Index of 0 is treated
- // as invalid and is never allocated.
- template <typename T, unsigned N = 32>
- struct IndexedSet {
- IndexedSet() { Map.reserve(N); }
-
- T get(uint32_t Idx) const {
- // Index Idx corresponds to Map[Idx-1].
- assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
- return Map[Idx-1];
- }
-
- uint32_t insert(T Val) {
- // Linear search.
- auto F = llvm::find(Map, Val);
- if (F != Map.end())
- return F - Map.begin() + 1;
- Map.push_back(Val);
- return Map.size(); // Return actual_index + 1.
- }
-
- uint32_t find(T Val) const {
- auto F = llvm::find(Map, Val);
- assert(F != Map.end());
- return F - Map.begin() + 1;
- }
-
- uint32_t size() const { return Map.size(); }
-
- using const_iterator = typename std::vector<T>::const_iterator;
-
- const_iterator begin() const { return Map.begin(); }
- const_iterator end() const { return Map.end(); }
-
- private:
- std::vector<T> Map;
- };
-
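Usage sketch, assuming the IndexedSet above is in scope: indices are 1-based and stable, and inserting a duplicate returns the index assigned on first insertion.

    #include <cassert>
    #include <cstdint>

    void indexedSetDemo() {
      IndexedSet<int> S;
      uint32_t A = S.insert(100); // First value gets index 1.
      uint32_t B = S.insert(200); // Second value gets index 2.
      assert(A == 1 && B == 2);
      assert(S.insert(100) == A); // Duplicate insert finds the old index.
      assert(S.get(B) == 200);    // get() inverts insert().
    }
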
- struct RegisterRef {
- RegisterId Reg = 0;
- LaneBitmask Mask = LaneBitmask::getNone();
-
- RegisterRef() = default;
- explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
- : Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {}
-
- operator bool() const {
- return Reg != 0 && Mask.any();
- }
-
- bool operator== (const RegisterRef &RR) const {
- return Reg == RR.Reg && Mask == RR.Mask;
- }
-
- bool operator!= (const RegisterRef &RR) const {
- return !operator==(RR);
- }
-
- bool operator< (const RegisterRef &RR) const {
- return Reg < RR.Reg || (Reg == RR.Reg && Mask < RR.Mask);
- }
- };
-
-
- struct PhysicalRegisterInfo {
- PhysicalRegisterInfo(const TargetRegisterInfo &tri,
- const MachineFunction &mf);
-
- static bool isRegMaskId(RegisterId R) {
- return Register::isStackSlot(R);
- }
-
- RegisterId getRegMaskId(const uint32_t *RM) const {
- return Register::index2StackSlot(RegMasks.find(RM));
- }
-
- const uint32_t *getRegMaskBits(RegisterId R) const {
- return RegMasks.get(Register::stackSlot2Index(R));
- }
-
- RegisterRef normalize(RegisterRef RR) const;
-
- bool alias(RegisterRef RA, RegisterRef RB) const {
- if (!isRegMaskId(RA.Reg))
- return !isRegMaskId(RB.Reg) ? aliasRR(RA, RB) : aliasRM(RA, RB);
- return !isRegMaskId(RB.Reg) ? aliasRM(RB, RA) : aliasMM(RA, RB);
- }
-
- std::set<RegisterId> getAliasSet(RegisterId Reg) const;
-
- RegisterRef getRefForUnit(uint32_t U) const {
- return RegisterRef(UnitInfos[U].Reg, UnitInfos[U].Mask);
- }
-
- const BitVector &getMaskUnits(RegisterId MaskId) const {
- return MaskInfos[Register::stackSlot2Index(MaskId)].Units;
- }
-
- RegisterRef mapTo(RegisterRef RR, unsigned R) const;
- const TargetRegisterInfo &getTRI() const { return TRI; }
-
- private:
- struct RegInfo {
- const TargetRegisterClass *RegClass = nullptr;
- };
- struct UnitInfo {
- RegisterId Reg = 0;
- LaneBitmask Mask;
- };
- struct MaskInfo {
- BitVector Units;
- };
-
- const TargetRegisterInfo &TRI;
- IndexedSet<const uint32_t*> RegMasks;
- std::vector<RegInfo> RegInfos;
- std::vector<UnitInfo> UnitInfos;
- std::vector<MaskInfo> MaskInfos;
-
- bool aliasRR(RegisterRef RA, RegisterRef RB) const;
- bool aliasRM(RegisterRef RR, RegisterRef RM) const;
- bool aliasMM(RegisterRef RM, RegisterRef RN) const;
- };
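
Note how the regmask handling above piggybacks on the stack-slot encoding: indices into RegMasks are mapped into the range that Register reserves for stack slots, so a single RegisterId can name either a register or a regmask, and isRegMaskId is a cheap range test. A simplified model of that tagging idea (the real llvm::Register layout differs; the top-bit tag here is only illustrative):

    #include <cstdint>

    // Two namespaces folded into one 32-bit ID: tagged values name "masks",
    // untagged values name registers. Illustrative only; not the real layout.
    constexpr uint32_t MaskTag = 1u << 31;

    constexpr bool isMaskId(uint32_t Id) { return (Id & MaskTag) != 0; }
    constexpr uint32_t index2MaskId(uint32_t Idx) { return Idx | MaskTag; }
    constexpr uint32_t maskId2Index(uint32_t Id) { return Id & ~MaskTag; }

    static_assert(isMaskId(index2MaskId(5)), "tagged IDs are recognizable");
    static_assert(maskId2Index(index2MaskId(5)) == 5, "mapping round-trips");
    static_assert(!isMaskId(5), "plain register IDs stay untagged");

    int main() { return 0; }
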
-
- struct RegisterAggr {
- RegisterAggr(const PhysicalRegisterInfo &pri)
- : Units(pri.getTRI().getNumRegUnits()), PRI(pri) {}
- RegisterAggr(const RegisterAggr &RG) = default;
-
- bool empty() const { return Units.none(); }
- bool hasAliasOf(RegisterRef RR) const;
- bool hasCoverOf(RegisterRef RR) const;
-
- static bool isCoverOf(RegisterRef RA, RegisterRef RB,
- const PhysicalRegisterInfo &PRI) {
- return RegisterAggr(PRI).insert(RA).hasCoverOf(RB);
- }
-
- RegisterAggr &insert(RegisterRef RR);
- RegisterAggr &insert(const RegisterAggr &RG);
- RegisterAggr &intersect(RegisterRef RR);
- RegisterAggr &intersect(const RegisterAggr &RG);
- RegisterAggr &clear(RegisterRef RR);
- RegisterAggr &clear(const RegisterAggr &RG);
-
- RegisterRef intersectWith(RegisterRef RR) const;
- RegisterRef clearIn(RegisterRef RR) const;
- RegisterRef makeRegRef() const;
-
- void print(raw_ostream &OS) const;
-
- struct rr_iterator {
- using MapType = std::map<RegisterId, LaneBitmask>;
-
- private:
- MapType Masks;
- MapType::iterator Pos;
- unsigned Index;
- const RegisterAggr *Owner;
-
- public:
- rr_iterator(const RegisterAggr &RG, bool End);
-
- RegisterRef operator*() const {
- return RegisterRef(Pos->first, Pos->second);
- }
-
- rr_iterator &operator++() {
- ++Pos;
- ++Index;
- return *this;
- }
-
- bool operator==(const rr_iterator &I) const {
- assert(Owner == I.Owner);
- (void)Owner;
- return Index == I.Index;
- }
-
- bool operator!=(const rr_iterator &I) const {
- return !(*this == I);
- }
- };
-
- rr_iterator rr_begin() const {
- return rr_iterator(*this, false);
- }
- rr_iterator rr_end() const {
- return rr_iterator(*this, true);
- }
-
- private:
- BitVector Units;
- const PhysicalRegisterInfo &PRI;
- };
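
Everything in RegisterAggr ultimately lives in the Units BitVector, so insert/intersect/clear are bitwise or/and/and-not over register units, hasCoverOf is a subset test, and hasAliasOf is a non-empty-intersection test. A stand-in using one 32-bit word as the unit set makes those semantics concrete (a sketch, not the LLVM class):

    #include <cassert>
    #include <cstdint>

    // Unit sets as 32-bit words: insert = or, intersect = and, clear = and-not.
    struct Aggr {
      uint32_t Units = 0;
      Aggr &insert(uint32_t U)    { Units |= U;  return *this; }
      Aggr &intersect(uint32_t U) { Units &= U;  return *this; }
      Aggr &clear(uint32_t U)     { Units &= ~U; return *this; }
      bool hasCoverOf(uint32_t U) const { return (U & ~Units) == 0; }
      bool hasAliasOf(uint32_t U) const { return (U & Units) != 0; }
    };

    int main() {
      Aggr A;
      A.insert(0b0110).insert(0b0001);
      assert(A.hasCoverOf(0b0111));    // All requested units are present.
      assert(A.hasAliasOf(0b1100));    // Shares unit 2, so it aliases.
      assert(!A.hasCoverOf(0b1000));   // Unit 3 missing, so not covered.
      return 0;
    }
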
-
- // Optionally print the lane mask, if it is not ~0.
- struct PrintLaneMaskOpt {
- PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
- LaneBitmask Mask;
- };
- raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P);
-
-} // end namespace rdf
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
Property changes on: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFRegisters.h
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFLiveness.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFLiveness.h (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFLiveness.h (nonexistent)
@@ -1,151 +0,0 @@
-//===- RDFLiveness.h --------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Recalculate the liveness information given a data flow graph.
-// This includes block live-ins and kill flags.
-
-#ifndef LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H
-#define LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H
-
-#include "RDFGraph.h"
-#include "RDFRegisters.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/MC/LaneBitmask.h"
-#include <map>
-#include <set>
-#include <utility>
-
-namespace llvm {
-
-class MachineBasicBlock;
-class MachineDominanceFrontier;
-class MachineDominatorTree;
-class MachineRegisterInfo;
-class TargetRegisterInfo;
-
-namespace rdf {
-
- struct Liveness {
- public:
- // This is really a std::map, except that it supplies a default value
- // (a copy of Empty) for elements accessed via [], since RegisterAggr
- // has no default constructor.
- struct LiveMapType {
- LiveMapType(const PhysicalRegisterInfo &pri) : Empty(pri) {}
-
- RegisterAggr &operator[] (MachineBasicBlock *B) {
- return Map.emplace(B, Empty).first->second;
- }
-
- private:
- RegisterAggr Empty;
- std::map<MachineBasicBlock*,RegisterAggr> Map;
- };
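
The pattern above is worth spelling out: std::map::operator[] requires a default-constructible mapped type, which RegisterAggr is not (it needs a PhysicalRegisterInfo), so LiveMapType keeps a pre-built Empty prototype and copies it on first access via emplace. A hedged standalone sketch of the same pattern, with hypothetical Ctx/Aggr stand-ins:

    #include <cassert>
    #include <map>

    struct Ctx {};                        // Stand-in for PhysicalRegisterInfo.
    struct Aggr {                         // Stand-in for RegisterAggr.
      explicit Aggr(const Ctx &C) : C(&C) {}
      const Ctx *C;
      int Bits = 0;
    };

    // operator[] on std::map<int, Aggr> would not compile: Aggr has no
    // default constructor. Copying a prototype on first access, as
    // LiveMapType does, sidesteps that requirement.
    struct LiveMap {
      LiveMap(const Ctx &C) : Empty(C) {}
      Aggr &operator[](int Key) {
        return Map.emplace(Key, Empty).first->second;  // Copy Empty on first use.
      }
    private:
      Aggr Empty;
      std::map<int, Aggr> Map;
    };

    int main() {
      Ctx C;
      LiveMap LM(C);
      LM[7].Bits |= 1;           // First access copies Empty, then mutates.
      assert(LM[7].Bits == 1);   // Second access finds the existing entry.
      return 0;
    }
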
-
- using NodeRef = std::pair<NodeId, LaneBitmask>;
- using NodeRefSet = std::set<NodeRef>;
- // RegisterId in RefMap must be normalized.
- using RefMap = std::map<RegisterId, NodeRefSet>;
-
- Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g)
- : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()),
- MDF(g.getDF()), LiveMap(g.getPRI()), Empty(), NoRegs(g.getPRI()) {}
-
- NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
- bool TopShadows, bool FullChain, const RegisterAggr &DefRRs);
-
- NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA) {
- return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false,
- false, NoRegs);
- }
-
- NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA) {
- return getAllReachingDefs(RefRR, RefA, false, false, NoRegs);
- }
-
- NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA,
- const RegisterAggr &DefRRs);
-
- NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA) {
- return getAllReachedUses(RefRR, DefA, NoRegs);
- }
-
- std::pair<NodeSet,bool> getAllReachingDefsRec(RegisterRef RefRR,
- NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs);
-
- NodeAddr<RefNode*> getNearestAliasedRef(RegisterRef RefRR,
- NodeAddr<InstrNode*> IA);
-
- LiveMapType &getLiveMap() { return LiveMap; }
- const LiveMapType &getLiveMap() const { return LiveMap; }
-
- const RefMap &getRealUses(NodeId P) const {
- auto F = RealUseMap.find(P);
- return F == RealUseMap.end() ? Empty : F->second;
- }
-
- void computePhiInfo();
- void computeLiveIns();
- void resetLiveIns();
- void resetKills();
- void resetKills(MachineBasicBlock *B);
-
- void trace(bool T) { Trace = T; }
-
- private:
- const DataFlowGraph &DFG;
- const TargetRegisterInfo &TRI;
- const PhysicalRegisterInfo &PRI;
- const MachineDominatorTree &MDT;
- const MachineDominanceFrontier &MDF;
- LiveMapType LiveMap;
- const RefMap Empty;
- const RegisterAggr NoRegs;
- bool Trace = false;
-
- // Cache of mapping from node ids (for RefNodes) to the containing
- // basic blocks. Not computing it each time for each node reduces
- // the liveness calculation time by a large fraction.
- using NodeBlockMap = DenseMap<NodeId, MachineBasicBlock *>;
- NodeBlockMap NBMap;
-
- // Phi information:
- //
- // RealUseMap
- // map: NodeId -> (map: RegisterId -> NodeRefSet)
- // phi id -> (map: register -> set of reached non-phi uses)
- std::map<NodeId, RefMap> RealUseMap;
-
- // Inverse iterated dominance frontier.
- std::map<MachineBasicBlock*,std::set<MachineBasicBlock*>> IIDF;
-
- // Live on entry.
- std::map<MachineBasicBlock*,RefMap> PhiLON;
-
- // Phi uses are considered to be located at the end of the block that
- // they are associated with. The reaching def of a phi use dominates the
- // block that the use corresponds to, but not the block that contains
- // the phi itself. To include these uses in the liveness propagation (up
- // the dominator tree), create a map: block -> set of uses live on exit.
- std::map<MachineBasicBlock*,RefMap> PhiLOX;
-
- MachineBasicBlock *getBlockWithRef(NodeId RN) const;
- void traverse(MachineBasicBlock *B, RefMap &LiveIn);
- void emptify(RefMap &M);
-
- std::pair<NodeSet,bool> getAllReachingDefsRecImpl(RegisterRef RefRR,
- NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs,
- unsigned Nest, unsigned MaxNest);
- };
-
- raw_ostream &operator<<(raw_ostream &OS, const Print<Liveness::RefMap> &P);
-
-} // end namespace rdf
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H
Property changes on: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFLiveness.h
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFLiveness.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFLiveness.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFLiveness.cpp (nonexistent)
@@ -1,1118 +0,0 @@
-//===- RDFLiveness.cpp ----------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computation of the liveness information from the data-flow graph.
-//
-// The main functionality of this code is to compute block live-in
-// information. With the live-in information in place, the placement
-// of kill flags can also be recalculated.
-//
-// The block live-in calculation is based on the ideas from the following
-// publication:
-//
-// Dibyendu Das, Ramakrishna Upadrasta, Benoit Dupont de Dinechin.
-// "Efficient Liveness Computation Using Merge Sets and DJ-Graphs."
-// ACM Transactions on Architecture and Code Optimization, Association for
-// Computing Machinery, 2012, ACM TACO Special Issue on "High-Performance
-// and Embedded Architectures and Compilers", 8 (4),
-// <10.1145/2086696.2086706>. <hal-00647369>
-//
-#include "RDFLiveness.h"
-#include "RDFGraph.h"
-#include "RDFRegisters.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineDominanceFrontier.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/MC/LaneBitmask.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace rdf;
-
-static cl::opt<unsigned> MaxRecNest("rdf-liveness-max-rec", cl::init(25),
- cl::Hidden, cl::desc("Maximum recursion level"));
-
-namespace llvm {
-namespace rdf {
-
- raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) {
- OS << '{';
- for (auto &I : P.Obj) {
- OS << ' ' << printReg(I.first, &P.G.getTRI()) << '{';
- for (auto J = I.second.begin(), E = I.second.end(); J != E; ) {
- OS << Print<NodeId>(J->first, P.G) << PrintLaneMaskOpt(J->second);
- if (++J != E)
- OS << ',';
- }
- OS << '}';
- }
- OS << " }";
- return OS;
- }
-
-} // end namespace rdf
-} // end namespace llvm
-
-// The order in the returned sequence is the order of reaching defs in the
-// upward traversal: the first def is the closest to the given reference RefA,
-// the next one is further up, and so on.
-// The list ends at a reaching phi def, or when the reference from RefA is
-// covered by the defs in the list (see FullChain).
-// This function provides two modes of operation:
-// (1) Returning the sequence of reaching defs for a particular reference
-// node. This sequence will terminate at the first phi node [1].
-// (2) Returning a partial sequence of reaching defs, where the final goal
-// is to traverse past phi nodes to the actual defs arising from the code
-// itself.
-// In mode (2), the register reference for which the search was started
-// may be different from the reference node RefA, for which this call was
-// made, hence the argument RefRR, which holds the original register.
-// Also, some definitions may have already been encountered in a previous
-// call that will influence register covering. The register references
-// already defined are passed in through DefRRs.
-// In mode (1), the "continuation" considerations do not apply, and the
-// RefRR is the same as the register in RefA, and the set DefRRs is empty.
-//
-// [1] It is possible for multiple phi nodes to be included in the returned
-// sequence:
-// SubA = phi ...
-// SubB = phi ...
-// ... = SuperAB(rdef:SubA), SuperAB"(rdef:SubB)
-// However, these phi nodes are independent from one another in terms of
-// the data-flow.
-
-NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
- NodeAddr<RefNode*> RefA, bool TopShadows, bool FullChain,
- const RegisterAggr &DefRRs) {
- NodeList RDefs; // Return value.
- SetVector<NodeId> DefQ;
- SetVector<NodeId> Owners;
-
- // Dead defs will be treated as if they were live, since they are actually
- // on the data-flow path. They cannot be ignored because even though they
- // do not generate meaningful values, they still modify registers.
-
- // If the reference is undefined, there is nothing to do.
- if (RefA.Addr->getFlags() & NodeAttrs::Undef)
- return RDefs;
-
- // The initial queue should not have reaching defs for shadows. The
- // whole point of a shadow is that it will have a reaching def that
- // is not aliased to the reaching defs of the related shadows.
- NodeId Start = RefA.Id;
- auto SNA = DFG.addr<RefNode*>(Start);
- if (NodeId RD = SNA.Addr->getReachingDef())
- DefQ.insert(RD);
- if (TopShadows) {
- for (auto S : DFG.getRelatedRefs(RefA.Addr->getOwner(DFG), RefA))
- if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef())
- DefQ.insert(RD);
- }
-
- // Collect all the reaching defs, going up until a phi node is encountered,
- // or there are no more reaching defs. From this set, the actual set of
- // reaching defs will be selected.
- // The traversal upwards must go on until a covering def is encountered.
- // It is possible that a collection of non-covering (individually) defs
- // will be sufficient, but keep going until a covering one is found.
- for (unsigned i = 0; i < DefQ.size(); ++i) {
- auto TA = DFG.addr<DefNode*>(DefQ[i]);
- if (TA.Addr->getFlags() & NodeAttrs::PhiRef)
- continue;
- // Stop at the covering/overwriting def of the initial register reference.
- RegisterRef RR = TA.Addr->getRegRef(DFG);
- if (!DFG.IsPreservingDef(TA))
- if (RegisterAggr::isCoverOf(RR, RefRR, PRI))
- continue;
- // Get the next level of reaching defs. This will include multiple
- // reaching defs for shadows.
- for (auto S : DFG.getRelatedRefs(TA.Addr->getOwner(DFG), TA))
- if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef())
- DefQ.insert(RD);
- }
-
- // Remove all non-phi defs that are not aliased to RefRR, and collect
- // the owners of the remaining defs.
- SetVector<NodeId> Defs;
- for (NodeId N : DefQ) {
- auto TA = DFG.addr<DefNode*>(N);
- bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef;
- if (!IsPhi && !PRI.alias(RefRR, TA.Addr->getRegRef(DFG)))
- continue;
- Defs.insert(TA.Id);
- Owners.insert(TA.Addr->getOwner(DFG).Id);
- }
-
- // Return the MachineBasicBlock containing a given instruction.
- auto Block = [this] (NodeAddr<InstrNode*> IA) -> MachineBasicBlock* {
- if (IA.Addr->getKind() == NodeAttrs::Stmt)
- return NodeAddr<StmtNode*>(IA).Addr->getCode()->getParent();
- assert(IA.Addr->getKind() == NodeAttrs::Phi);
- NodeAddr<PhiNode*> PA = IA;
- NodeAddr<BlockNode*> BA = PA.Addr->getOwner(DFG);
- return BA.Addr->getCode();
- };
- // Less(A,B) iff instruction A is further down in the dominator tree than B.
- auto Less = [&Block,this] (NodeId A, NodeId B) -> bool {
- if (A == B)
- return false;
- auto OA = DFG.addr<InstrNode*>(A), OB = DFG.addr<InstrNode*>(B);
- MachineBasicBlock *BA = Block(OA), *BB = Block(OB);
- if (BA != BB)
- return MDT.dominates(BB, BA);
- // They are in the same block.
- bool StmtA = OA.Addr->getKind() == NodeAttrs::Stmt;
- bool StmtB = OB.Addr->getKind() == NodeAttrs::Stmt;
- if (StmtA) {
- if (!StmtB) // OB is a phi and phis dominate statements.
- return true;
- MachineInstr *CA = NodeAddr<StmtNode*>(OA).Addr->getCode();
- MachineInstr *CB = NodeAddr<StmtNode*>(OB).Addr->getCode();
- // The order must be linear, so tie-break such equalities.
- if (CA == CB)
- return A < B;
- return MDT.dominates(CB, CA);
- } else {
- // OA is a phi.
- if (StmtB)
- return false;
- // Both are phis. There is no ordering between phis (in terms of
- // the data-flow), so tie-break this via node id comparison.
- return A < B;
- }
- };
-
- std::vector<NodeId> Tmp(Owners.begin(), Owners.end());
- llvm::sort(Tmp, Less);
-
- // The vector is a list of instructions, so that defs coming from
- // the same instruction don't need to be artificially ordered.
- // Then, when computing the initial segment, and iterating over an
- // instruction, pick the defs that contribute to the covering (i.e. are
- // not covered by previously added defs). Check the defs individually,
- // i.e. first check whether each def is covered or not (without adding them
- // to the tracking set), and then add all the selected ones.
-
- // The reason for this is this example:
- // *d1<A>, *d2<B>, ... Assume A and B are aliased (can happen in phi nodes).
- // *d3<C> If A \incl BuC, and B \incl AuC, then *d2 would be
- // covered if we added A first, and A would be covered
- // if we added B first.
-
- RegisterAggr RRs(DefRRs);
-
- auto DefInSet = [&Defs] (NodeAddr<RefNode*> TA) -> bool {
- return TA.Addr->getKind() == NodeAttrs::Def &&
- Defs.count(TA.Id);
- };
- for (NodeId T : Tmp) {
- if (!FullChain && RRs.hasCoverOf(RefRR))
- break;
- auto TA = DFG.addr<InstrNode*>(T);
- bool IsPhi = DFG.IsCode<NodeAttrs::Phi>(TA);
- NodeList Ds;
- for (NodeAddr<DefNode*> DA : TA.Addr->members_if(DefInSet, DFG)) {
- RegisterRef QR = DA.Addr->getRegRef(DFG);
- // Add phi defs even if they are covered by subsequent defs. This is
- // for cases where the reached use is not covered by any of the defs
- // encountered so far: the phi def is needed to expose the liveness
- // of that use to the entry of the block.
- // Example:
- // phi d1<R3>(,d2,), ... Phi def d1 is covered by d2.
- // d2<R3>(d1,,u3), ...
- // ..., u3<D1>(d2) This use needs to be live on entry.
- if (FullChain || IsPhi || !RRs.hasCoverOf(QR))
- Ds.push_back(DA);
- }
- RDefs.insert(RDefs.end(), Ds.begin(), Ds.end());
- for (NodeAddr<DefNode*> DA : Ds) {
- // When collecting a full chain of definitions, do not consider phi
- // defs to actually define a register.
- uint16_t Flags = DA.Addr->getFlags();
- if (!FullChain || !(Flags & NodeAttrs::PhiRef))
- if (!(Flags & NodeAttrs::Preserving)) // Don't care about Undef here.
- RRs.insert(DA.Addr->getRegRef(DFG));
- }
- }
-
- auto DeadP = [](const NodeAddr<DefNode*> DA) -> bool {
- return DA.Addr->getFlags() & NodeAttrs::Dead;
- };
- RDefs.resize(std::distance(RDefs.begin(), llvm::remove_if(RDefs, DeadP)));
-
- return RDefs;
-}
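
The order-dependence warned about in the "*d1<A>, *d2<B>" comment inside getAllReachingDefs above is easy to reproduce with plain bitmasks. A self-contained sketch (registers as uint8_t bit sets, "covered" meaning subset) of why defs within one instruction are checked first and only then added, under the assumed aliasing A within B-union-C and B within A-union-C:

    #include <cassert>
    #include <cstdint>

    // Registers as bitmasks: A and B alias (share bit 0), C is disjoint-ish.
    constexpr uint8_t A = 0b011, B = 0b101, C = 0b110;
    constexpr bool covers(uint8_t Have, uint8_t Want) {
      return (Want & ~Have) == 0;  // Want is a subset of Have.
    }

    int main() {
      // Suppose C was accumulated earlier: A is covered by B|C, B by A|C.
      uint8_t Have = C;
      // Add-as-you-go would be order-dependent: inserting A first makes B
      // look covered, and vice versa.
      assert(covers(Have | A, B) && covers(Have | B, A));
      // Check-first-then-add, as the loop above does per instruction:
      bool TakeA = !covers(Have, A);  // Checked against Have without A or B.
      bool TakeB = !covers(Have, B);
      assert(TakeA && TakeB);         // Both defs are kept.
      Have |= (TakeA ? A : 0) | (TakeB ? B : 0);
      return 0;
    }
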
-
-std::pair<NodeSet,bool>
-Liveness::getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
- NodeSet &Visited, const NodeSet &Defs) {
- return getAllReachingDefsRecImpl(RefRR, RefA, Visited, Defs, 0, MaxRecNest);
-}
-
-std::pair<NodeSet,bool>
-Liveness::getAllReachingDefsRecImpl(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
- NodeSet &Visited, const NodeSet &Defs, unsigned Nest, unsigned MaxNest) {
- if (Nest > MaxNest)
- return { NodeSet(), false };
- // Collect all defined registers. Do not consider phis to be defining
- // anything, only collect "real" definitions.
- RegisterAggr DefRRs(PRI);
- for (NodeId D : Defs) {
- const auto DA = DFG.addr<const DefNode*>(D);
- if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
- DefRRs.insert(DA.Addr->getRegRef(DFG));
- }
-
- NodeList RDs = getAllReachingDefs(RefRR, RefA, false, true, DefRRs);
- if (RDs.empty())
- return { Defs, true };
-
- // Make a copy of the preexisting definitions and add the newly found ones.
- NodeSet TmpDefs = Defs;
- for (NodeAddr<NodeBase*> R : RDs)
- TmpDefs.insert(R.Id);
-
- NodeSet Result = Defs;
-
- for (NodeAddr<DefNode*> DA : RDs) {
- Result.insert(DA.Id);
- if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
- continue;
- NodeAddr<PhiNode*> PA = DA.Addr->getOwner(DFG);
- if (Visited.count(PA.Id))
- continue;
- Visited.insert(PA.Id);
- // Go over all phi uses and get the reaching defs for each use.
- for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
- const auto &T = getAllReachingDefsRecImpl(RefRR, U, Visited, TmpDefs,
- Nest+1, MaxNest);
- if (!T.second)
- return { T.first, false };
- Result.insert(T.first.begin(), T.first.end());
- }
- }
-
- return { Result, true };
-}
-
-/// Find the nearest ref node aliased to RefRR, going upwards in the data
-/// flow, starting from the instruction immediately preceding Inst.
-NodeAddr<RefNode*> Liveness::getNearestAliasedRef(RegisterRef RefRR,
- NodeAddr<InstrNode*> IA) {
- NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
- NodeList Ins = BA.Addr->members(DFG);
- NodeId FindId = IA.Id;
- auto E = Ins.rend();
- auto B = std::find_if(Ins.rbegin(), E,
- [FindId] (const NodeAddr<InstrNode*> T) {
- return T.Id == FindId;
- });
- // Do not scan IA (which is what B would point to).
- if (B != E)
- ++B;
-
- do {
- // Process the range of instructions from B to E.
- for (NodeAddr<InstrNode*> I : make_range(B, E)) {
- NodeList Refs = I.Addr->members(DFG);
- NodeAddr<RefNode*> Clob, Use;
- // Scan all the refs in I aliased to RefRR, and return the one that
- // is the closest to the output of I, i.e. def > clobber > use.
- for (NodeAddr<RefNode*> R : Refs) {
- if (!PRI.alias(R.Addr->getRegRef(DFG), RefRR))
- continue;
- if (DFG.IsDef(R)) {
- // If it's a non-clobbering def, just return it.
- if (!(R.Addr->getFlags() & NodeAttrs::Clobbering))
- return R;
- Clob = R;
- } else {
- Use = R;
- }
- }
- if (Clob.Id != 0)
- return Clob;
- if (Use.Id != 0)
- return Use;
- }
-
- // Go up to the immediate dominator, if any.
- MachineBasicBlock *BB = BA.Addr->getCode();
- BA = NodeAddr<BlockNode*>();
- if (MachineDomTreeNode *N = MDT.getNode(BB)) {
- if ((N = N->getIDom()))
- BA = DFG.findBlock(N->getBlock());
- }
- if (!BA.Id)
- break;
-
- Ins = BA.Addr->members(DFG);
- B = Ins.rbegin();
- E = Ins.rend();
- } while (true);
-
- return NodeAddr<RefNode*>();
-}
-
-NodeSet Liveness::getAllReachedUses(RegisterRef RefRR,
- NodeAddr<DefNode*> DefA, const RegisterAggr &DefRRs) {
- NodeSet Uses;
-
- // If the original register is already covered by all the intervening
- // defs, no more uses can be reached.
- if (DefRRs.hasCoverOf(RefRR))
- return Uses;
-
- // Add all directly reached uses.
- // If the def is dead, it does not provide a value for any use.
- bool IsDead = DefA.Addr->getFlags() & NodeAttrs::Dead;
- NodeId U = !IsDead ? DefA.Addr->getReachedUse() : 0;
- while (U != 0) {
- auto UA = DFG.addr<UseNode*>(U);
- if (!(UA.Addr->getFlags() & NodeAttrs::Undef)) {
- RegisterRef UR = UA.Addr->getRegRef(DFG);
- if (PRI.alias(RefRR, UR) && !DefRRs.hasCoverOf(UR))
- Uses.insert(U);
- }
- U = UA.Addr->getSibling();
- }
-
- // Traverse all reached defs. This time dead defs cannot be ignored.
- for (NodeId D = DefA.Addr->getReachedDef(), NextD; D != 0; D = NextD) {
- auto DA = DFG.addr<DefNode*>(D);
- NextD = DA.Addr->getSibling();
- RegisterRef DR = DA.Addr->getRegRef(DFG);
- // If this def is already covered, it cannot reach anything new.
- // Similarly, skip it if it is not aliased to the interesting register.
- if (DefRRs.hasCoverOf(DR) || !PRI.alias(RefRR, DR))
- continue;
- NodeSet T;
- if (DFG.IsPreservingDef(DA)) {
- // If it is a preserving def, do not update the set of intervening defs.
- T = getAllReachedUses(RefRR, DA, DefRRs);
- } else {
- RegisterAggr NewDefRRs = DefRRs;
- NewDefRRs.insert(DR);
- T = getAllReachedUses(RefRR, DA, NewDefRRs);
- }
- Uses.insert(T.begin(), T.end());
- }
- return Uses;
-}
-
-void Liveness::computePhiInfo() {
- RealUseMap.clear();
-
- NodeList Phis;
- NodeAddr<FuncNode*> FA = DFG.getFunc();
- NodeList Blocks = FA.Addr->members(DFG);
- for (NodeAddr<BlockNode*> BA : Blocks) {
- auto Ps = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG);
- Phis.insert(Phis.end(), Ps.begin(), Ps.end());
- }
-
- // phi use -> (map: reaching phi -> set of registers defined in between)
- std::map<NodeId,std::map<NodeId,RegisterAggr>> PhiUp;
- std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation.
- std::map<NodeId,RegisterAggr> PhiDRs; // Phi -> registers defined by it.
-
- // Go over all phis.
- for (NodeAddr<PhiNode*> PhiA : Phis) {
- // Go over all defs and collect the reached uses that are non-phi uses
- // (i.e. the "real uses").
- RefMap &RealUses = RealUseMap[PhiA.Id];
- NodeList PhiRefs = PhiA.Addr->members(DFG);
-
- // Have a work queue of defs whose reached uses need to be found.
- // For each def, add to the queue all reached (non-phi) defs.
- SetVector<NodeId> DefQ;
- NodeSet PhiDefs;
- RegisterAggr DRs(PRI);
- for (NodeAddr<RefNode*> R : PhiRefs) {
- if (!DFG.IsRef<NodeAttrs::Def>(R))
- continue;
- DRs.insert(R.Addr->getRegRef(DFG));
- DefQ.insert(R.Id);
- PhiDefs.insert(R.Id);
- }
- PhiDRs.insert(std::make_pair(PhiA.Id, DRs));
-
- // Collect the super-set of all possible reached uses. This set will
- // contain all uses reached from this phi, either directly from the
- // phi defs, or (recursively) via non-phi defs reached by the phi defs.
- // This set of uses will later be trimmed to contain only those uses that
- // are actually reached by the phi defs.
- for (unsigned i = 0; i < DefQ.size(); ++i) {
- NodeAddr<DefNode*> DA = DFG.addr<DefNode*>(DefQ[i]);
- // Visit all reached uses. Phi defs should not really have the "dead"
- // flag set, but check it anyway for consistency.
- bool IsDead = DA.Addr->getFlags() & NodeAttrs::Dead;
- NodeId UN = !IsDead ? DA.Addr->getReachedUse() : 0;
- while (UN != 0) {
- NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN);
- uint16_t F = A.Addr->getFlags();
- if ((F & (NodeAttrs::Undef | NodeAttrs::PhiRef)) == 0) {
- RegisterRef R = PRI.normalize(A.Addr->getRegRef(DFG));
- RealUses[R.Reg].insert({A.Id,R.Mask});
- }
- UN = A.Addr->getSibling();
- }
- // Visit all reached defs, and add them to the queue. These defs may
- // override some of the uses collected here, but that will be handled
- // later.
- NodeId DN = DA.Addr->getReachedDef();
- while (DN != 0) {
- NodeAddr<DefNode*> A = DFG.addr<DefNode*>(DN);
- for (auto T : DFG.getRelatedRefs(A.Addr->getOwner(DFG), A)) {
- uint16_t Flags = NodeAddr<DefNode*>(T).Addr->getFlags();
- // Must traverse the reached-def chain. Consider:
- // def(D0) -> def(R0) -> def(R0) -> use(D0)
- // The reachable use of D0 passes through a def of R0.
- if (!(Flags & NodeAttrs::PhiRef))
- DefQ.insert(T.Id);
- }
- DN = A.Addr->getSibling();
- }
- }
- // Filter out those uses that appear to be reachable, but really
- // are not. For example:
- //
- // R1:0 = d1
- // = R1:0 u2 Reached by d1.
- // R0 = d3
- // = R1:0 u4 Still reached by d1: indirectly through
- // the def d3.
- // R1 = d5
- // = R1:0 u6 Not reached by d1 (covered collectively
- // by d3 and d5), but following reached
- // defs and uses from d1 will lead here.
- for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE; ) {
- // For each reached register UI->first, there is a set UI->second, of
- // uses of it. For each such use, check if it is reached by this phi,
- // i.e. check if the set of its reaching uses intersects the set of
- // this phi's defs.
- NodeRefSet Uses = UI->second;
- UI->second.clear();
- for (std::pair<NodeId,LaneBitmask> I : Uses) {
- auto UA = DFG.addr<UseNode*>(I.first);
- // Undef flag is checked above.
- assert((UA.Addr->getFlags() & NodeAttrs::Undef) == 0);
- RegisterRef R(UI->first, I.second);
- // Calculate the exposed part of the reached use.
- RegisterAggr Covered(PRI);
- for (NodeAddr<DefNode*> DA : getAllReachingDefs(R, UA)) {
- if (PhiDefs.count(DA.Id))
- break;
- Covered.insert(DA.Addr->getRegRef(DFG));
- }
- if (RegisterRef RC = Covered.clearIn(R)) {
- // We are updating the map for register UI->first, so we need
- // to map RC to be expressed in terms of that register.
- RegisterRef S = PRI.mapTo(RC, UI->first);
- UI->second.insert({I.first, S.Mask});
- }
- }
- UI = UI->second.empty() ? RealUses.erase(UI) : std::next(UI);
- }
-
- // If this phi reaches some "real" uses, add it to the queue for upward
- // propagation.
- if (!RealUses.empty())
- PhiUQ.push_back(PhiA.Id);
-
- // Go over all phi uses and check if the reaching def is another phi.
- // Collect the phis that are among the reaching defs of these uses.
- // While traversing the list of reaching defs for each phi use, accumulate
- // the set of registers defined between this phi (PhiA) and the owner phi
- // of the reaching def.
- NodeSet SeenUses;
-
- for (auto I : PhiRefs) {
- if (!DFG.IsRef<NodeAttrs::Use>(I) || SeenUses.count(I.Id))
- continue;
- NodeAddr<PhiUseNode*> PUA = I;
- if (PUA.Addr->getReachingDef() == 0)
- continue;
-
- RegisterRef UR = PUA.Addr->getRegRef(DFG);
- NodeList Ds = getAllReachingDefs(UR, PUA, true, false, NoRegs);
- RegisterAggr DefRRs(PRI);
-
- for (NodeAddr<DefNode*> D : Ds) {
- if (D.Addr->getFlags() & NodeAttrs::PhiRef) {
- NodeId RP = D.Addr->getOwner(DFG).Id;
- std::map<NodeId,RegisterAggr> &M = PhiUp[PUA.Id];
- auto F = M.find(RP);
- if (F == M.end())
- M.insert(std::make_pair(RP, DefRRs));
- else
- F->second.insert(DefRRs);
- }
- DefRRs.insert(D.Addr->getRegRef(DFG));
- }
-
- for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PhiA, PUA))
- SeenUses.insert(T.Id);
- }
- }
-
- if (Trace) {
- dbgs() << "Phi-up-to-phi map with intervening defs:\n";
- for (auto I : PhiUp) {
- dbgs() << "phi " << Print<NodeId>(I.first, DFG) << " -> {";
- for (auto R : I.second)
- dbgs() << ' ' << Print<NodeId>(R.first, DFG)
- << Print<RegisterAggr>(R.second, DFG);
- dbgs() << " }\n";
- }
- }
-
- // Propagate the reached registers up in the phi chain.
- //
- // The following type of situation needs careful handling:
- //
- // phi d1<R1:0> (1)
- // |
- // ... d2<R1>
- // |
- // phi u3<R1:0> (2)
- // |
- // ... u4<R1>
- //
- // The phi node (2) defines a register pair R1:0, and reaches a "real"
- // use u4 of just R1. The same phi node is also known to reach (upwards)
- // the phi node (1). However, the use u4 is not reached by phi (1),
- // because of the intervening definition d2 of R1. The data flow between
- // phis (1) and (2) is restricted to R1:0 minus R1, i.e. R0.
- //
- // When propagating uses up the phi chains, get the all reaching defs
- // for a given phi use, and traverse the list until the propagated ref
- // is covered, or until reaching the final phi. Only assume that the
- // reference reaches the phi in the latter case.
-
- for (unsigned i = 0; i < PhiUQ.size(); ++i) {
- auto PA = DFG.addr<PhiNode*>(PhiUQ[i]);
- NodeList PUs = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG);
- RefMap &RUM = RealUseMap[PA.Id];
-
- for (NodeAddr<UseNode*> UA : PUs) {
- std::map<NodeId,RegisterAggr> &PUM = PhiUp[UA.Id];
- RegisterRef UR = PRI.normalize(UA.Addr->getRegRef(DFG));
- for (const std::pair<const NodeId, RegisterAggr> &P : PUM) {
- bool Changed = false;
- const RegisterAggr &MidDefs = P.second;
-
- // Collect the set PropUp of uses that are reached by the current
- // phi PA, and are not covered by any intervening def between the
- // currently visited use UA and the upward phi P.
-
- if (MidDefs.hasCoverOf(UR))
- continue;
-
- // General algorithm:
- // for each (R,U) : U is use node of R, U is reached by PA
- // if MidDefs does not cover (R,U)
- // then add (R-MidDefs,U) to RealUseMap[P]
- //
- for (const std::pair<const RegisterId, NodeRefSet> &T : RUM) {
- RegisterRef R(T.first);
- // The current phi (PA) could be a phi for a regmask. It could
- // reach a whole variety of uses that are not related to the
- // specific upward phi (P.first).
- const RegisterAggr &DRs = PhiDRs.at(P.first);
- if (!DRs.hasAliasOf(R))
- continue;
- R = PRI.mapTo(DRs.intersectWith(R), T.first);
- for (std::pair<NodeId,LaneBitmask> V : T.second) {
- LaneBitmask M = R.Mask & V.second;
- if (M.none())
- continue;
- if (RegisterRef SS = MidDefs.clearIn(RegisterRef(R.Reg, M))) {
- NodeRefSet &RS = RealUseMap[P.first][SS.Reg];
- Changed |= RS.insert({V.first,SS.Mask}).second;
- }
- }
- }
-
- if (Changed)
- PhiUQ.push_back(P.first);
- }
- }
- }
-
- if (Trace) {
- dbgs() << "Real use map:\n";
- for (auto I : RealUseMap) {
- dbgs() << "phi " << Print<NodeId>(I.first, DFG);
- NodeAddr<PhiNode*> PA = DFG.addr<PhiNode*>(I.first);
- NodeList Ds = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Def>, DFG);
- if (!Ds.empty()) {
- RegisterRef RR = NodeAddr<DefNode*>(Ds[0]).Addr->getRegRef(DFG);
- dbgs() << '<' << Print<RegisterRef>(RR, DFG) << '>';
- } else {
- dbgs() << "<noreg>";
- }
- dbgs() << " -> " << Print<RefMap>(I.second, DFG) << '\n';
- }
- }
-}
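
The R1:0-minus-R1 situation described in the phi-chain comment above reduces to lane-mask subtraction, which is what clearIn computes. A minimal two-lane model (bit 0 = R0, bit 1 = R1; an illustration, not the LaneBitmask API):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Lanes of the double register R1:0 -> bit 0 is R0, bit 1 is R1.
      const uint8_t R1_0 = 0b11, R1 = 0b10;
      // The intervening def d2<R1> blocks that lane between phis (2) and (1):
      uint8_t Flowing = R1_0 & static_cast<uint8_t>(~R1);
      assert(Flowing == 0b01);  // Only R0 propagates up to phi (1).
      return 0;
    }
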
-
-void Liveness::computeLiveIns() {
- // Populate the node-to-block map. This speeds up the calculations
- // significantly.
- NBMap.clear();
- for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) {
- MachineBasicBlock *BB = BA.Addr->getCode();
- for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
- for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
- NBMap.insert(std::make_pair(RA.Id, BB));
- NBMap.insert(std::make_pair(IA.Id, BB));
- }
- }
-
- MachineFunction &MF = DFG.getMF();
-
- // Compute IDF first, then the inverse.
- decltype(IIDF) IDF;
- for (MachineBasicBlock &B : MF) {
- auto F1 = MDF.find(&B);
- if (F1 == MDF.end())
- continue;
- SetVector<MachineBasicBlock*> IDFB(F1->second.begin(), F1->second.end());
- for (unsigned i = 0; i < IDFB.size(); ++i) {
- auto F2 = MDF.find(IDFB[i]);
- if (F2 != MDF.end())
- IDFB.insert(F2->second.begin(), F2->second.end());
- }
- // Add B to the IDF(B). This will put B in the IIDF(B).
- IDFB.insert(&B);
- IDF[&B].insert(IDFB.begin(), IDFB.end());
- }
-
- for (auto I : IDF)
- for (auto S : I.second)
- IIDF[S].insert(I.first);
-
- computePhiInfo();
-
- NodeAddr<FuncNode*> FA = DFG.getFunc();
- NodeList Blocks = FA.Addr->members(DFG);
-
- // Build the phi live-on-entry map.
- for (NodeAddr<BlockNode*> BA : Blocks) {
- MachineBasicBlock *MB = BA.Addr->getCode();
- RefMap &LON = PhiLON[MB];
- for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG))
- for (const RefMap::value_type &S : RealUseMap[P.Id])
- LON[S.first].insert(S.second.begin(), S.second.end());
- }
-
- if (Trace) {
- dbgs() << "Phi live-on-entry map:\n";
- for (auto &I : PhiLON)
- dbgs() << "block #" << I.first->getNumber() << " -> "
- << Print<RefMap>(I.second, DFG) << '\n';
- }
-
- // Build the phi live-on-exit map. Each phi node has some set of reached
- // "real" uses. Propagate this set backwards into the block predecessors
- // through the reaching defs of the corresponding phi uses.
- for (NodeAddr<BlockNode*> BA : Blocks) {
- NodeList Phis = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG);
- for (NodeAddr<PhiNode*> PA : Phis) {
- RefMap &RUs = RealUseMap[PA.Id];
- if (RUs.empty())
- continue;
-
- NodeSet SeenUses;
- for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
- if (!SeenUses.insert(U.Id).second)
- continue;
- NodeAddr<PhiUseNode*> PUA = U;
- if (PUA.Addr->getReachingDef() == 0)
- continue;
-
- // Each phi has some set (possibly empty) of reached "real" uses,
- // that is, uses that are part of the compiled program. Such a use
- // may be located in some farther block, but following a chain of
- // reaching defs will eventually lead to this phi.
- // Any chain of reaching defs may fork at a phi node, but there
- // will be a path upwards that will lead to this phi. Now, this
- // chain will need to fork at this phi, since some of the reached
- // uses may have definitions joining in from multiple predecessors.
- // For each reached "real" use, identify the set of reaching defs
- // coming from each predecessor P, and add them to PhiLOX[P].
- //
- auto PrA = DFG.addr<BlockNode*>(PUA.Addr->getPredecessor());
- RefMap &LOX = PhiLOX[PrA.Addr->getCode()];
-
- for (const std::pair<const RegisterId, NodeRefSet> &RS : RUs) {
- // We need to visit each individual use.
- for (std::pair<NodeId,LaneBitmask> P : RS.second) {
- // Create a register ref corresponding to the use, and find
- // all reaching defs starting from the phi use, and treating
- // all related shadows as a single use cluster.
- RegisterRef S(RS.first, P.second);
- NodeList Ds = getAllReachingDefs(S, PUA, true, false, NoRegs);
- for (NodeAddr<DefNode*> D : Ds) {
- // Calculate the mask corresponding to the visited def.
- RegisterAggr TA(PRI);
- TA.insert(D.Addr->getRegRef(DFG)).intersect(S);
- LaneBitmask TM = TA.makeRegRef().Mask;
- LOX[S.Reg].insert({D.Id, TM});
- }
- }
- }
-
- for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PA, PUA))
- SeenUses.insert(T.Id);
- } // for U : phi uses
- } // for P : Phis
- } // for B : Blocks
-
- if (Trace) {
- dbgs() << "Phi live-on-exit map:\n";
- for (auto &I : PhiLOX)
- dbgs() << "block #" << I.first->getNumber() << " -> "
- << Print<RefMap>(I.second, DFG) << '\n';
- }
-
- RefMap LiveIn;
- traverse(&MF.front(), LiveIn);
-
- // Add function live-ins to the live-in set of the function entry block.
- LiveMap[&MF.front()].insert(DFG.getLiveIns());
-
- if (Trace) {
- // Dump the liveness map
- for (MachineBasicBlock &B : MF) {
- std::vector<RegisterRef> LV;
- for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I)
- LV.push_back(RegisterRef(I->PhysReg, I->LaneMask));
- llvm::sort(LV);
- dbgs() << printMBBReference(B) << "\t rec = {";
- for (auto I : LV)
- dbgs() << ' ' << Print<RegisterRef>(I, DFG);
- dbgs() << " }\n";
- //dbgs() << "\tcomp = " << Print<RegisterAggr>(LiveMap[&B], DFG) << '\n';
-
- LV.clear();
- const RegisterAggr &LG = LiveMap[&B];
- for (auto I = LG.rr_begin(), E = LG.rr_end(); I != E; ++I)
- LV.push_back(*I);
- llvm::sort(LV);
- dbgs() << "\tcomp = {";
- for (auto I : LV)
- dbgs() << ' ' << Print<RegisterRef>(I, DFG);
- dbgs() << " }\n";
-
- }
- }
-}
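
The IDF loop near the top of computeLiveIns is the standard "iterate the dominance frontier to a fixed point" construction: the worklist grows while being scanned, exactly like the SetVector above. The real loop additionally inserts B itself so that B lands in its own inverse IDF; the self-contained sketch below (blocks as ints, DF precomputed) keeps just the fixpoint part:

    #include <cassert>
    #include <map>
    #include <set>
    #include <vector>

    // Iterated dominance frontier: union DF of everything collected so far
    // until nothing new appears.
    std::set<int> computeIDF(int B, const std::map<int, std::set<int>> &DF) {
      std::vector<int> Work;
      std::set<int> Seen;
      auto push = [&](int X) { if (Seen.insert(X).second) Work.push_back(X); };
      auto F = DF.find(B);
      if (F != DF.end())
        for (int X : F->second) push(X);
      for (unsigned i = 0; i < Work.size(); ++i) {  // Work grows as we scan.
        auto G = DF.find(Work[i]);
        if (G != DF.end())
          for (int X : G->second) push(X);
      }
      return Seen;
    }

    int main() {
      // DF(1) = {2}, DF(2) = {3}, DF(3) = {} -> IDF(1) = {2, 3}.
      std::map<int, std::set<int>> DF{{1, {2}}, {2, {3}}};
      assert(computeIDF(1, DF) == (std::set<int>{2, 3}));
      return 0;
    }
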
-
-void Liveness::resetLiveIns() {
- for (auto &B : DFG.getMF()) {
- // Remove all live-ins.
- std::vector<unsigned> T;
- for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I)
- T.push_back(I->PhysReg);
- for (auto I : T)
- B.removeLiveIn(I);
- // Add the newly computed live-ins.
- const RegisterAggr &LiveIns = LiveMap[&B];
- for (auto I = LiveIns.rr_begin(), E = LiveIns.rr_end(); I != E; ++I) {
- RegisterRef R = *I;
- B.addLiveIn({MCPhysReg(R.Reg), R.Mask});
- }
- }
-}
-
-void Liveness::resetKills() {
- for (auto &B : DFG.getMF())
- resetKills(&B);
-}
-
-void Liveness::resetKills(MachineBasicBlock *B) {
- auto CopyLiveIns = [this] (MachineBasicBlock *B, BitVector &LV) -> void {
- for (auto I : B->liveins()) {
- MCSubRegIndexIterator S(I.PhysReg, &TRI);
- if (!S.isValid()) {
- LV.set(I.PhysReg);
- continue;
- }
- do {
- LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
- if ((M & I.LaneMask).any())
- LV.set(S.getSubReg());
- ++S;
- } while (S.isValid());
- }
- };
-
- BitVector LiveIn(TRI.getNumRegs()), Live(TRI.getNumRegs());
- CopyLiveIns(B, LiveIn);
- for (auto SI : B->successors())
- CopyLiveIns(SI, Live);
-
- for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) {
- MachineInstr *MI = &*I;
- if (MI->isDebugInstr())
- continue;
-
- MI->clearKillInfo();
- for (auto &Op : MI->operands()) {
- // An implicit def of a super-register may not necessarily start a
- // live range of it, since an implicit use could be used to keep parts
- // of it live. Instead of analyzing the implicit operands, ignore
- // implicit defs.
- if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
- continue;
- Register R = Op.getReg();
- if (!Register::isPhysicalRegister(R))
- continue;
- for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
- Live.reset(*SR);
- }
- for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isUse() || Op.isUndef())
- continue;
- Register R = Op.getReg();
- if (!Register::isPhysicalRegister(R))
- continue;
- bool IsLive = false;
- for (MCRegAliasIterator AR(R, &TRI, true); AR.isValid(); ++AR) {
- if (!Live[*AR])
- continue;
- IsLive = true;
- break;
- }
- if (!IsLive)
- Op.setIsKill(true);
- for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
- Live.set(*SR);
- }
- }
-}
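
The backward scan in resetKills follows the textbook rule: walking up from the block's end, a use is a kill iff the register is not live below it, and a def ends liveness above it. A compact model with registers as small integers and instructions as (defs, uses) records; this deliberately omits the sub-register and aliasing structure that the real code handles via MCSubRegIterator/MCRegAliasIterator:

    #include <cassert>
    #include <set>
    #include <utility>
    #include <vector>

    using Regs = std::set<int>;
    struct Inst { Regs Defs, Uses, Kills; };

    void recomputeKills(std::vector<Inst> &Block, Regs LiveOut) {
      Regs Live = std::move(LiveOut);
      for (auto I = Block.rbegin(), E = Block.rend(); I != E; ++I) {
        I->Kills.clear();
        for (int R : I->Defs) Live.erase(R);        // Defs end liveness above.
        for (int R : I->Uses) {
          if (!Live.count(R)) I->Kills.insert(R);   // Last use seen from below.
          Live.insert(R);
        }
      }
    }

    int main() {
      // r1 = ...; use r1; use r1 -- only the bottom use is the kill,
      // since nothing below it keeps r1 live.
      std::vector<Inst> B = {{{1}, {}, {}}, {{}, {1}, {}}, {{}, {1}, {}}};
      recomputeKills(B, /*LiveOut=*/{});
      assert(!B[1].Kills.count(1) && B[2].Kills.count(1));
      return 0;
    }
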
-
-// Helper function to obtain the basic block containing the reaching def
-// of the given use.
-MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const {
- auto F = NBMap.find(RN);
- if (F != NBMap.end())
- return F->second;
- llvm_unreachable("Node id not in map");
-}
-
-void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
- // The LiveIn map, for each (physical) register, contains the set of live
- // reaching defs of that register that are live on entry to the associated
- // block.
-
- // The summary of the traversal algorithm:
- //
- // R is live-in in B, if there exists a U(R), such that rdef(R) dom B
- // and (U \in IDF(B) or B dom U).
- //
- // for (C : children) {
- // LU = {}
- // traverse(C, LU)
- // LiveUses += LU
- // }
- //
- // LiveUses -= Defs(B);
- // LiveUses += UpwardExposedUses(B);
- // for (C : IIDF[B])
- // for (U : LiveUses)
- // if (Rdef(U) dom C)
- // C.addLiveIn(U)
- //
-
- // Go up the dominator tree (depth-first).
- MachineDomTreeNode *N = MDT.getNode(B);
- for (auto I : *N) {
- RefMap L;
- MachineBasicBlock *SB = I->getBlock();
- traverse(SB, L);
-
- for (auto S : L)
- LiveIn[S.first].insert(S.second.begin(), S.second.end());
- }
-
- if (Trace) {
- dbgs() << "\n-- " << printMBBReference(*B) << ": " << __func__
- << " after recursion into: {";
- for (auto I : *N)
- dbgs() << ' ' << I->getBlock()->getNumber();
- dbgs() << " }\n";
- dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
- dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
- }
-
- // Add reaching defs of phi uses that are live on exit from this block.
- RefMap &PUs = PhiLOX[B];
- for (auto &S : PUs)
- LiveIn[S.first].insert(S.second.begin(), S.second.end());
-
- if (Trace) {
- dbgs() << "after LOX\n";
- dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
- dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
- }
-
- // The LiveIn map at this point has all defs that are live-on-exit from B,
- // as if they were live-on-entry to B. First, we need to filter out all
- // defs that are present in this block. Then we will add reaching defs of
- // all upward-exposed uses.
-
- // To filter out the defs, first make a copy of LiveIn, and then re-populate
- // LiveIn with the defs that should remain.
- RefMap LiveInCopy = LiveIn;
- LiveIn.clear();
-
- for (const std::pair<const RegisterId, NodeRefSet> &LE : LiveInCopy) {
- RegisterRef LRef(LE.first);
- NodeRefSet &NewDefs = LiveIn[LRef.Reg]; // To be filled.
- const NodeRefSet &OldDefs = LE.second;
- for (NodeRef OR : OldDefs) {
- // R is a def node that was live-on-exit
- auto DA = DFG.addr<DefNode*>(OR.first);
- NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG);
- NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
- if (B != BA.Addr->getCode()) {
- // Defs from a different block need to be preserved. Defs from this
- // block will need to be processed further, except for phi defs, the
- // liveness of which is handled through the PhiLON/PhiLOX maps.
- NewDefs.insert(OR);
- continue;
- }
-
- // Defs from this block need to stop the liveness from being
- // propagated upwards. This only applies to non-preserving defs,
- // and to the parts of the register actually covered by those defs.
- // (Note that phi defs should always be preserving.)
- RegisterAggr RRs(PRI);
- LRef.Mask = OR.second;
-
- if (!DFG.IsPreservingDef(DA)) {
- assert(!(IA.Addr->getFlags() & NodeAttrs::Phi));
- // DA is a non-phi def that is live-on-exit from this block, and
- // that is also located in this block. LRef is a register ref
- // whose use this def reaches. If DA covers LRef, then no part
- // of LRef is exposed upwards.
- if (RRs.insert(DA.Addr->getRegRef(DFG)).hasCoverOf(LRef))
- continue;
- }
-
- // DA itself was not sufficient to cover LRef. In general, it is
- // the last in a chain of aliased defs before the exit from this block.
- // There could be other defs in this block that are a part of that
- // chain. Check that now: accumulate the registers from these defs,
- // and if they all together cover LRef, it is not live-on-entry.
- for (NodeAddr<DefNode*> TA : getAllReachingDefs(DA)) {
- // DefNode -> InstrNode -> BlockNode.
- NodeAddr<InstrNode*> ITA = TA.Addr->getOwner(DFG);
- NodeAddr<BlockNode*> BTA = ITA.Addr->getOwner(DFG);
- // Reaching defs are ordered in the upward direction.
- if (BTA.Addr->getCode() != B) {
- // We have reached past the beginning of B, and the accumulated
- // registers are not covering LRef. The first def from the
- // upward chain will be live.
- // Subtract all accumulated defs (RRs) from LRef.
- RegisterRef T = RRs.clearIn(LRef);
- assert(T);
- NewDefs.insert({TA.Id,T.Mask});
- break;
- }
-
- // TA is in B. Only add this def to the accumulated cover if it is
- // not preserving.
- if (!(TA.Addr->getFlags() & NodeAttrs::Preserving))
- RRs.insert(TA.Addr->getRegRef(DFG));
- // If this is enough to cover LRef, then stop.
- if (RRs.hasCoverOf(LRef))
- break;
- }
- }
- }
-
- emptify(LiveIn);
-
- if (Trace) {
- dbgs() << "after defs in block\n";
- dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
- dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
- }
-
- // Scan the block for upward-exposed uses and add them to the tracking set.
- for (auto I : DFG.getFunc().Addr->findBlock(B, DFG).Addr->members(DFG)) {
- NodeAddr<InstrNode*> IA = I;
- if (IA.Addr->getKind() != NodeAttrs::Stmt)
- continue;
- for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
- if (UA.Addr->getFlags() & NodeAttrs::Undef)
- continue;
- RegisterRef RR = PRI.normalize(UA.Addr->getRegRef(DFG));
- for (NodeAddr<DefNode*> D : getAllReachingDefs(UA))
- if (getBlockWithRef(D.Id) != B)
- LiveIn[RR.Reg].insert({D.Id,RR.Mask});
- }
- }
-
- if (Trace) {
- dbgs() << "after uses in block\n";
- dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
- dbgs() << " Local: " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
- }
-
- // Phi uses should not be propagated up the dominator tree, since they
- // are not dominated by their corresponding reaching defs.
- RegisterAggr &Local = LiveMap[B];
- RefMap &LON = PhiLON[B];
- for (auto &R : LON) {
- LaneBitmask M;
- for (auto P : R.second)
- M |= P.second;
- Local.insert(RegisterRef(R.first,M));
- }
-
- if (Trace) {
- dbgs() << "after phi uses in block\n";
- dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
- dbgs() << " Local: " << Print<RegisterAggr>(Local, DFG) << '\n';
- }
-
- for (auto C : IIDF[B]) {
- RegisterAggr &LiveC = LiveMap[C];
- for (const std::pair<const RegisterId, NodeRefSet> &S : LiveIn)
- for (auto R : S.second)
- if (MDT.properlyDominates(getBlockWithRef(R.first), C))
- LiveC.insert(RegisterRef(S.first, R.second));
- }
-}
-
-void Liveness::emptify(RefMap &M) {
- for (auto I = M.begin(), E = M.end(); I != E; )
- I = I->second.empty() ? M.erase(I) : std::next(I);
-}
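
emptify uses the standard erase-while-iterating idiom: map::erase returns the iterator past the erased element, so the loop advances either through erase's result or std::next. The same pattern standalone:

    #include <cassert>
    #include <iterator>
    #include <map>
    #include <set>

    int main() {
      std::map<int, std::set<int>> M{{1, {}}, {2, {9}}, {3, {}}};
      // Drop entries whose mapped set is empty; map::erase invalidates only
      // the erased iterator, so the cached end() stays valid.
      for (auto I = M.begin(), E = M.end(); I != E; )
        I = I->second.empty() ? M.erase(I) : std::next(I);
      assert(M.size() == 1 && M.count(2) == 1);
      return 0;
    }
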
Property changes on: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFLiveness.cpp
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp (revision 362609)
@@ -1,828 +1,828 @@
//===- HexagonOptAddrMode.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This implements a Hexagon-specific pass to optimize addressing mode for
// load/store instructions.
//===----------------------------------------------------------------------===//
#include "HexagonInstrInfo.h"
#include "HexagonSubtarget.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
-#include "RDFRegisters.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#define DEBUG_TYPE "opt-addr-mode"
using namespace llvm;
using namespace rdf;
static cl::opt<int> CodeGrowthLimit("hexagon-amode-growth-limit",
cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode "
"optimization"));
namespace llvm {
FunctionPass *createHexagonOptAddrMode();
void initializeHexagonOptAddrModePass(PassRegistry&);
} // end namespace llvm
namespace {
class HexagonOptAddrMode : public MachineFunctionPass {
public:
static char ID;
HexagonOptAddrMode() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "Optimize addressing mode of load/store";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachineDominanceFrontier>();
AU.setPreservesAll();
}
bool runOnMachineFunction(MachineFunction &MF) override;
private:
using MISetType = DenseSet<MachineInstr *>;
using InstrEvalMap = DenseMap<MachineInstr *, bool>;
MachineRegisterInfo *MRI = nullptr;
const HexagonInstrInfo *HII = nullptr;
const HexagonRegisterInfo *HRI = nullptr;
MachineDominatorTree *MDT = nullptr;
DataFlowGraph *DFG = nullptr;
DataFlowGraph::DefStackMap DefM;
Liveness *LV = nullptr;
MISetType Deleted;
bool processBlock(NodeAddr<BlockNode *> BA);
bool xformUseMI(MachineInstr *TfrMI, MachineInstr *UseMI,
NodeAddr<UseNode *> UseN, unsigned UseMOnum);
bool processAddUses(NodeAddr<StmtNode *> AddSN, MachineInstr *AddMI,
const NodeList &UNodeList);
bool updateAddUses(MachineInstr *AddMI, MachineInstr *UseMI);
bool analyzeUses(unsigned DefR, const NodeList &UNodeList,
InstrEvalMap &InstrEvalResult, short &SizeInc);
bool hasRepForm(MachineInstr &MI, unsigned TfrDefR);
bool canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, MachineInstr &MI,
const NodeList &UNodeList);
bool isSafeToExtLR(NodeAddr<StmtNode *> SN, MachineInstr *MI,
unsigned LRExtReg, const NodeList &UNodeList);
void getAllRealUses(NodeAddr<StmtNode *> SN, NodeList &UNodeList);
bool allValidCandidates(NodeAddr<StmtNode *> SA, NodeList &UNodeList);
short getBaseWithLongOffset(const MachineInstr &MI) const;
bool changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
unsigned ImmOpNum);
bool changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum);
bool changeAddAsl(NodeAddr<UseNode *> AddAslUN, MachineInstr *AddAslMI,
const MachineOperand &ImmOp, unsigned ImmOpNum);
bool isValidOffset(MachineInstr *MI, int Offset);
};
} // end anonymous namespace
char HexagonOptAddrMode::ID = 0;
INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "amode-opt",
"Optimize addressing mode", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
INITIALIZE_PASS_END(HexagonOptAddrMode, "amode-opt", "Optimize addressing mode",
false, false)
bool HexagonOptAddrMode::hasRepForm(MachineInstr &MI, unsigned TfrDefR) {
const MCInstrDesc &MID = MI.getDesc();
if ((!MID.mayStore() && !MID.mayLoad()) || HII->isPredicated(MI))
return false;
if (MID.mayStore()) {
MachineOperand StOp = MI.getOperand(MI.getNumOperands() - 1);
if (StOp.isReg() && StOp.getReg() == TfrDefR)
return false;
}
if (HII->getAddrMode(MI) == HexagonII::BaseRegOffset)
// Transform to absolute plus register offset.
return (HII->changeAddrMode_rr_ur(MI) >= 0);
else if (HII->getAddrMode(MI) == HexagonII::BaseImmOffset)
// Transform to absolute addressing mode.
return (HII->changeAddrMode_io_abs(MI) >= 0);
return false;
}
// Check if addasl instruction can be removed. This is possible only
// if it feeds only load/store instructions with base + register
// offset, as these instructions can be transformed to use 'absolute plus
// shifted register offset'.
// ex:
// Rs = ##foo
// Rx = addasl(Rs, Rt, #2)
// Rd = memw(Rx + #28)
// The three instructions above can be replaced with Rd = memw(Rt<<#2 + ##foo+28)
bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN,
MachineInstr &MI,
const NodeList &UNodeList) {
// Check the offset size in addasl; if 'offset > 3', return false.
const MachineOperand &OffsetOp = MI.getOperand(3);
if (!OffsetOp.isImm() || OffsetOp.getImm() > 3)
return false;
Register OffsetReg = MI.getOperand(2).getReg();
RegisterRef OffsetRR;
NodeId OffsetRegRD = 0;
for (NodeAddr<UseNode *> UA : AddAslSN.Addr->members_if(DFG->IsUse, *DFG)) {
RegisterRef RR = UA.Addr->getRegRef(*DFG);
if (OffsetReg == RR.Reg) {
OffsetRR = RR;
OffsetRegRD = UA.Addr->getReachingDef();
}
}
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UA = *I;
NodeAddr<InstrNode *> IA = UA.Addr->getOwner(*DFG);
if (UA.Addr->getFlags() & NodeAttrs::PhiRef)
return false;
NodeAddr<RefNode*> AA = LV->getNearestAliasedRef(OffsetRR, IA);
if ((DFG->IsDef(AA) && AA.Id != OffsetRegRD) ||
AA.Addr->getReachingDef() != OffsetRegRD)
return false;
MachineInstr &UseMI = *NodeAddr<StmtNode *>(IA).Addr->getCode();
NodeAddr<DefNode *> OffsetRegDN = DFG->addr<DefNode *>(OffsetRegRD);
// Reaching Def to an offset register can't be a phi.
if ((OffsetRegDN.Addr->getFlags() & NodeAttrs::PhiRef) &&
MI.getParent() != UseMI.getParent())
return false;
const MCInstrDesc &UseMID = UseMI.getDesc();
if ((!UseMID.mayLoad() && !UseMID.mayStore()) ||
HII->getAddrMode(UseMI) != HexagonII::BaseImmOffset ||
getBaseWithLongOffset(UseMI) < 0)
return false;
// Addasl output can't be a store value.
if (UseMID.mayStore() && UseMI.getOperand(2).isReg() &&
UseMI.getOperand(2).getReg() == MI.getOperand(0).getReg())
return false;
for (auto &Mo : UseMI.operands())
if (Mo.isFI())
return false;
}
return true;
}
bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
NodeList &UNodeList) {
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UN = *I;
RegisterRef UR = UN.Addr->getRegRef(*DFG);
NodeSet Visited, Defs;
const auto &P = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
if (!P.second) {
LLVM_DEBUG({
dbgs() << "*** Unable to collect all reaching defs for use ***\n"
<< PrintNode<UseNode*>(UN, *DFG) << '\n'
<< "The program's complexity may exceed the limits.\n";
});
return false;
}
const auto &ReachingDefs = P.first;
if (ReachingDefs.size() > 1) {
LLVM_DEBUG({
dbgs() << "*** Multiple Reaching Defs found!!! ***\n";
for (auto DI : ReachingDefs) {
NodeAddr<UseNode *> DA = DFG->addr<UseNode *>(DI);
NodeAddr<StmtNode *> TempIA = DA.Addr->getOwner(*DFG);
dbgs() << "\t\t[Reaching Def]: "
<< Print<NodeAddr<InstrNode *>>(TempIA, *DFG) << "\n";
}
});
return false;
}
}
return true;
}
void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
NodeList &UNodeList) {
for (NodeAddr<DefNode *> DA : SA.Addr->members_if(DFG->IsDef, *DFG)) {
LLVM_DEBUG(dbgs() << "\t\t[DefNode]: "
<< Print<NodeAddr<DefNode *>>(DA, *DFG) << "\n");
RegisterRef DR = DFG->getPRI().normalize(DA.Addr->getRegRef(*DFG));
auto UseSet = LV->getAllReachedUses(DR, DA);
for (auto UI : UseSet) {
NodeAddr<UseNode *> UA = DFG->addr<UseNode *>(UI);
LLVM_DEBUG({
NodeAddr<StmtNode *> TempIA = UA.Addr->getOwner(*DFG);
dbgs() << "\t\t\t[Reached Use]: "
<< Print<NodeAddr<InstrNode *>>(TempIA, *DFG) << "\n";
});
if (UA.Addr->getFlags() & NodeAttrs::PhiRef) {
NodeAddr<PhiNode *> PA = UA.Addr->getOwner(*DFG);
NodeId id = PA.Id;
const Liveness::RefMap &phiUse = LV->getRealUses(id);
LLVM_DEBUG(dbgs() << "\t\t\t\tphi real Uses"
<< Print<Liveness::RefMap>(phiUse, *DFG) << "\n");
if (!phiUse.empty()) {
for (auto I : phiUse) {
if (!DFG->getPRI().alias(RegisterRef(I.first), DR))
continue;
auto phiUseSet = I.second;
for (auto phiUI : phiUseSet) {
NodeAddr<UseNode *> phiUA = DFG->addr<UseNode *>(phiUI.first);
UNodeList.push_back(phiUA);
}
}
}
} else
UNodeList.push_back(UA);
}
}
}
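// Check whether the live range of LRExtReg can be extended from SN to every
// load/store in UNodeList, i.e. that no other definition of LRExtReg
// intervenes between SN and those uses.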
bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr<StmtNode *> SN,
MachineInstr *MI, unsigned LRExtReg,
const NodeList &UNodeList) {
RegisterRef LRExtRR;
NodeId LRExtRegRD = 0;
// Iterate through all the UseNodes in SN and find the reaching def
// for the LRExtReg.
for (NodeAddr<UseNode *> UA : SN.Addr->members_if(DFG->IsUse, *DFG)) {
RegisterRef RR = UA.Addr->getRegRef(*DFG);
if (LRExtReg == RR.Reg) {
LRExtRR = RR;
LRExtRegRD = UA.Addr->getReachingDef();
}
}
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UA = *I;
NodeAddr<InstrNode *> IA = UA.Addr->getOwner(*DFG);
// The reaching def of LRExtRR at the load/store node should be the same
// as the one reaching SN.
if (UA.Addr->getFlags() & NodeAttrs::PhiRef)
return false;
NodeAddr<RefNode*> AA = LV->getNearestAliasedRef(LRExtRR, IA);
if ((DFG->IsDef(AA) && AA.Id != LRExtRegRD) ||
AA.Addr->getReachingDef() != LRExtRegRD) {
LLVM_DEBUG(
dbgs() << "isSafeToExtLR: Returning false; another reaching def\n");
return false;
}
MachineInstr *UseMI = NodeAddr<StmtNode *>(IA).Addr->getCode();
NodeAddr<DefNode *> LRExtRegDN = DFG->addr<DefNode *>(LRExtRegRD);
// Reaching Def to LRExtReg can't be a phi.
if ((LRExtRegDN.Addr->getFlags() & NodeAttrs::PhiRef) &&
MI->getParent() != UseMI->getParent())
return false;
}
return true;
}
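// A new offset is valid only if it is properly aligned for the memory
// access size and fits in the offset range of MI's opcode.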
bool HexagonOptAddrMode::isValidOffset(MachineInstr *MI, int Offset) {
unsigned AlignMask = 0;
switch (HII->getMemAccessSize(*MI)) {
case HexagonII::MemAccessSize::DoubleWordAccess:
AlignMask = 0x7;
break;
case HexagonII::MemAccessSize::WordAccess:
AlignMask = 0x3;
break;
case HexagonII::MemAccessSize::HalfWordAccess:
AlignMask = 0x1;
break;
case HexagonII::MemAccessSize::ByteAccess:
AlignMask = 0x0;
break;
default:
return false;
}
if ((AlignMask & Offset) != 0)
return false;
return HII->isValidOffset(MI->getOpcode(), Offset, HRI, false);
}
bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
MachineInstr *AddMI,
const NodeList &UNodeList) {
Register AddDefR = AddMI->getOperand(0).getReg();
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UN = *I;
NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG);
MachineInstr *MI = SN.Addr->getCode();
const MCInstrDesc &MID = MI->getDesc();
if ((!MID.mayLoad() && !MID.mayStore()) ||
HII->getAddrMode(*MI) != HexagonII::BaseImmOffset ||
HII->isHVXVec(*MI))
return false;
MachineOperand BaseOp = MID.mayLoad() ? MI->getOperand(1)
: MI->getOperand(0);
if (!BaseOp.isReg() || BaseOp.getReg() != AddDefR)
return false;
MachineOperand OffsetOp = MID.mayLoad() ? MI->getOperand(2)
: MI->getOperand(1);
if (!OffsetOp.isImm())
return false;
int64_t newOffset = OffsetOp.getImm() + AddMI->getOperand(2).getImm();
if (!isValidOffset(MI, newOffset))
return false;
// Since we'll be extending the live range of Rt in the following example,
// make sure that this is safe: another definition of Rt must not exist
// between the 'add' and the load/store instruction.
//
// Ex: Rx= add(Rt,#10)
// memw(Rx+#0) = Rs
// will be replaced with => memw(Rt+#10) = Rs
Register BaseReg = AddMI->getOperand(1).getReg();
if (!isSafeToExtLR(AddSN, AddMI, BaseReg, UNodeList))
return false;
}
// Update all the uses of 'add' with the appropriate base and offset
// values.
bool Changed = false;
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UseN = *I;
assert(!(UseN.Addr->getFlags() & NodeAttrs::PhiRef) &&
"Found a PhiRef node as a real reached use!!");
NodeAddr<StmtNode *> OwnerN = UseN.Addr->getOwner(*DFG);
MachineInstr *UseMI = OwnerN.Addr->getCode();
LLVM_DEBUG(dbgs() << "\t\t[MI <BB#" << UseMI->getParent()->getNumber()
<< ">]: " << *UseMI << "\n");
Changed |= updateAddUses(AddMI, UseMI);
}
if (Changed)
Deleted.insert(AddMI);
return Changed;
}
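// Fold the add into its use: replace the base register of the load/store
// with the add's source register and increase the immediate offset by the
// add's constant.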
bool HexagonOptAddrMode::updateAddUses(MachineInstr *AddMI,
MachineInstr *UseMI) {
const MachineOperand ImmOp = AddMI->getOperand(2);
const MachineOperand AddRegOp = AddMI->getOperand(1);
Register newReg = AddRegOp.getReg();
const MCInstrDesc &MID = UseMI->getDesc();
MachineOperand &BaseOp = MID.mayLoad() ? UseMI->getOperand(1)
: UseMI->getOperand(0);
MachineOperand &OffsetOp = MID.mayLoad() ? UseMI->getOperand(2)
: UseMI->getOperand(1);
BaseOp.setReg(newReg);
BaseOp.setIsUndef(AddRegOp.isUndef());
BaseOp.setImplicit(AddRegOp.isImplicit());
OffsetOp.setImm(ImmOp.getImm() + OffsetOp.getImm());
MRI->clearKillFlags(newReg);
return true;
}
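// Examine every use of the transfer register and record in InstrEvalResult
// whether it can be rewritten. SizeInc accumulates the net change in
// instruction count so that the caller can bound code growth.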
bool HexagonOptAddrMode::analyzeUses(unsigned tfrDefR,
const NodeList &UNodeList,
InstrEvalMap &InstrEvalResult,
short &SizeInc) {
bool KeepTfr = false;
bool HasRepInstr = false;
InstrEvalResult.clear();
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
bool CanBeReplaced = false;
NodeAddr<UseNode *> UN = *I;
NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG);
MachineInstr &MI = *SN.Addr->getCode();
const MCInstrDesc &MID = MI.getDesc();
if ((MID.mayLoad() || MID.mayStore())) {
if (!hasRepForm(MI, tfrDefR)) {
KeepTfr = true;
continue;
}
SizeInc++;
CanBeReplaced = true;
} else if (MI.getOpcode() == Hexagon::S2_addasl_rrri) {
NodeList AddaslUseList;
LLVM_DEBUG(dbgs() << "\nGetting ReachedUses for === " << MI << "\n");
getAllRealUses(SN, AddaslUseList);
// Process phi nodes.
if (allValidCandidates(SN, AddaslUseList) &&
canRemoveAddasl(SN, MI, AddaslUseList)) {
SizeInc += AddaslUseList.size();
SizeInc -= 1; // Reduce size by 1 as addasl itself can be removed.
CanBeReplaced = true;
} else
SizeInc++;
} else
// Currently, only load/store and addasl are handled.
// Some other instructions to consider:
// A2_add -> A2_addi
// M4_mpyrr_addr -> M4_mpyrr_addi
KeepTfr = true;
InstrEvalResult[&MI] = CanBeReplaced;
HasRepInstr |= CanBeReplaced;
}
// Reduce total size by 2 if original tfr can be deleted.
if (!KeepTfr)
SizeInc -= 2;
return HasRepInstr;
}
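// Rewrite a load so that the global address in ImmOp is folded into its
// addressing mode, selecting the new opcode based on which operand the
// transfer register occupied and on the load's current addressing mode.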
bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
unsigned ImmOpNum) {
bool Changed = false;
MachineBasicBlock *BB = OldMI->getParent();
auto UsePos = MachineBasicBlock::iterator(OldMI);
MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
++InsertPt;
unsigned OpStart;
unsigned OpEnd = OldMI->getNumOperands();
MachineInstrBuilder MIB;
if (ImmOpNum == 1) {
if (HII->getAddrMode(*OldMI) == HexagonII::BaseRegOffset) {
short NewOpCode = HII->changeAddrMode_rr_ur(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
MIB.add(OldMI->getOperand(0));
MIB.add(OldMI->getOperand(2));
MIB.add(OldMI->getOperand(3));
MIB.add(ImmOp);
OpStart = 4;
Changed = true;
} else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset &&
OldMI->getOperand(2).isImm()) {
short NewOpCode = HII->changeAddrMode_io_abs(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode))
.add(OldMI->getOperand(0));
const GlobalValue *GV = ImmOp.getGlobal();
int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(2).getImm();
MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
OpStart = 3;
Changed = true;
} else
Changed = false;
if (Changed) {
LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
}
} else if (ImmOpNum == 2) {
if (OldMI->getOperand(3).isImm() && OldMI->getOperand(3).getImm() == 0) {
short NewOpCode = HII->changeAddrMode_rr_io(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
MIB.add(OldMI->getOperand(0));
MIB.add(OldMI->getOperand(1));
MIB.add(ImmOp);
OpStart = 4;
Changed = true;
LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
}
}
if (Changed)
for (unsigned i = OpStart; i < OpEnd; ++i)
MIB.add(OldMI->getOperand(i));
return Changed;
}
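// Rewrite a store in the same manner as changeLoad, with the operand
// positions shifted to account for the stored value.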
bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
unsigned ImmOpNum) {
bool Changed = false;
unsigned OpStart = 0;
unsigned OpEnd = OldMI->getNumOperands();
MachineBasicBlock *BB = OldMI->getParent();
auto UsePos = MachineBasicBlock::iterator(OldMI);
MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
++InsertPt;
MachineInstrBuilder MIB;
if (ImmOpNum == 0) {
if (HII->getAddrMode(*OldMI) == HexagonII::BaseRegOffset) {
short NewOpCode = HII->changeAddrMode_rr_ur(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
MIB.add(OldMI->getOperand(1));
MIB.add(OldMI->getOperand(2));
MIB.add(ImmOp);
MIB.add(OldMI->getOperand(3));
OpStart = 4;
} else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
short NewOpCode = HII->changeAddrMode_io_abs(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
const GlobalValue *GV = ImmOp.getGlobal();
int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(1).getImm();
MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
MIB.add(OldMI->getOperand(2));
OpStart = 3;
}
Changed = true;
LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
} else if (ImmOpNum == 1 && OldMI->getOperand(2).getImm() == 0) {
short NewOpCode = HII->changeAddrMode_rr_io(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
MIB.add(OldMI->getOperand(0));
MIB.add(ImmOp);
OpStart = 3;
Changed = true;
LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
}
if (Changed)
for (unsigned i = OpStart; i < OpEnd; ++i)
MIB.add(OldMI->getOperand(i));
return Changed;
}
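// Return the opcode for the 'base + shifted register + long offset' form of
// MI, converting from base+immediate to base+register first when necessary.
// A negative value indicates that no such form exists.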
short HexagonOptAddrMode::getBaseWithLongOffset(const MachineInstr &MI) const {
if (HII->getAddrMode(MI) == HexagonII::BaseImmOffset) {
short TempOpCode = HII->changeAddrMode_io_rr(MI);
return HII->changeAddrMode_rr_ur(TempOpCode);
}
return HII->changeAddrMode_rr_ur(MI);
}
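// Replace each load/store fed by the addasl with its long-offset form
// mem(Rt << #u2 + ##global), folding together the addasl's shift amount and
// the global address from the original transfer.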
bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN,
MachineInstr *AddAslMI,
const MachineOperand &ImmOp,
unsigned ImmOpNum) {
NodeAddr<StmtNode *> SA = AddAslUN.Addr->getOwner(*DFG);
LLVM_DEBUG(dbgs() << "Processing addasl :" << *AddAslMI << "\n");
NodeList UNodeList;
getAllRealUses(SA, UNodeList);
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UseUN = *I;
assert(!(UseUN.Addr->getFlags() & NodeAttrs::PhiRef) &&
"Can't transform this 'AddAsl' instruction!");
NodeAddr<StmtNode *> UseIA = UseUN.Addr->getOwner(*DFG);
LLVM_DEBUG(dbgs() << "[InstrNode]: "
<< Print<NodeAddr<InstrNode *>>(UseIA, *DFG) << "\n");
MachineInstr *UseMI = UseIA.Addr->getCode();
LLVM_DEBUG(dbgs() << "[MI <" << printMBBReference(*UseMI->getParent())
<< ">]: " << *UseMI << "\n");
const MCInstrDesc &UseMID = UseMI->getDesc();
assert(HII->getAddrMode(*UseMI) == HexagonII::BaseImmOffset);
auto UsePos = MachineBasicBlock::iterator(UseMI);
MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
short NewOpCode = getBaseWithLongOffset(*UseMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
unsigned OpStart;
unsigned OpEnd = UseMI->getNumOperands();
MachineBasicBlock *BB = UseMI->getParent();
MachineInstrBuilder MIB =
BuildMI(*BB, InsertPt, UseMI->getDebugLoc(), HII->get(NewOpCode));
// Change mem(Rs + #) -> mem(Rt << # + ##)
if (UseMID.mayLoad()) {
MIB.add(UseMI->getOperand(0));
MIB.add(AddAslMI->getOperand(2));
MIB.add(AddAslMI->getOperand(3));
const GlobalValue *GV = ImmOp.getGlobal();
MIB.addGlobalAddress(GV, UseMI->getOperand(2).getImm()+ImmOp.getOffset(),
ImmOp.getTargetFlags());
OpStart = 3;
} else if (UseMID.mayStore()) {
MIB.add(AddAslMI->getOperand(2));
MIB.add(AddAslMI->getOperand(3));
const GlobalValue *GV = ImmOp.getGlobal();
MIB.addGlobalAddress(GV, UseMI->getOperand(1).getImm()+ImmOp.getOffset(),
ImmOp.getTargetFlags());
MIB.add(UseMI->getOperand(2));
OpStart = 3;
} else
llvm_unreachable("Unhandled instruction");
for (unsigned i = OpStart; i < OpEnd; ++i)
MIB.add(UseMI->getOperand(i));
Deleted.insert(UseMI);
}
return true;
}
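// Rewrite a single use of the transfer register, dispatching on whether the
// using instruction is a load, a store, or an addasl.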
bool HexagonOptAddrMode::xformUseMI(MachineInstr *TfrMI, MachineInstr *UseMI,
NodeAddr<UseNode *> UseN,
unsigned UseMOnum) {
const MachineOperand ImmOp = TfrMI->getOperand(1);
const MCInstrDesc &MID = UseMI->getDesc();
bool Changed = false;
if (MID.mayLoad())
Changed = changeLoad(UseMI, ImmOp, UseMOnum);
else if (MID.mayStore())
Changed = changeStore(UseMI, ImmOp, UseMOnum);
else if (UseMI->getOpcode() == Hexagon::S2_addasl_rrri)
Changed = changeAddAsl(UseN, UseMI, ImmOp, UseMOnum);
if (Changed)
Deleted.insert(UseMI);
return Changed;
}
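// Look for A2_tfrsi of a global address, or A2_addi with an immediate that
// does not need a constant extender, and try to fold them into all of their
// uses.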
bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
bool Changed = false;
for (auto IA : BA.Addr->members(*DFG)) {
if (!DFG->IsCode<NodeAttrs::Stmt>(IA))
continue;
NodeAddr<StmtNode *> SA = IA;
MachineInstr *MI = SA.Addr->getCode();
if ((MI->getOpcode() != Hexagon::A2_tfrsi ||
!MI->getOperand(1).isGlobal()) &&
(MI->getOpcode() != Hexagon::A2_addi ||
!MI->getOperand(2).isImm() || HII->isConstExtended(*MI)))
continue;
LLVM_DEBUG(dbgs() << "[Analyzing " << HII->getName(MI->getOpcode())
<< "]: " << *MI << "\n\t[InstrNode]: "
<< Print<NodeAddr<InstrNode *>>(IA, *DFG) << '\n');
NodeList UNodeList;
getAllRealUses(SA, UNodeList);
if (!allValidCandidates(SA, UNodeList))
continue;
// Analyze all uses of 'add'. If the output of 'add' is used as an address
// in base+immediate addressing mode load/store instructions, see if they
// can be updated to use the immediate value as an offset, thus giving us
// the opportunity to eliminate 'add'.
// Ex: Rx= add(Rt,#12)
// memw(Rx+#0) = Rs
// This can be replaced with memw(Rt+#12) = Rs
//
// This transformation is only performed if all uses can be updated and
// the offset isn't required to be constant extended.
if (MI->getOpcode() == Hexagon::A2_addi) {
Changed |= processAddUses(SA, MI, UNodeList);
continue;
}
short SizeInc = 0;
Register DefR = MI->getOperand(0).getReg();
InstrEvalMap InstrEvalResult;
// Analyze all uses and calculate the increase in size. Perform the
// optimization only if the increase in size is within the code growth limit.
if (!analyzeUses(DefR, UNodeList, InstrEvalResult, SizeInc))
continue;
if (SizeInc > CodeGrowthLimit)
continue;
bool KeepTfr = false;
LLVM_DEBUG(dbgs() << "\t[Total reached uses] : " << UNodeList.size()
<< "\n");
LLVM_DEBUG(dbgs() << "\t[Processing Reached Uses] ===\n");
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UseN = *I;
assert(!(UseN.Addr->getFlags() & NodeAttrs::PhiRef) &&
"Found a PhiRef node as a real reached use!!");
NodeAddr<StmtNode *> OwnerN = UseN.Addr->getOwner(*DFG);
MachineInstr *UseMI = OwnerN.Addr->getCode();
LLVM_DEBUG(dbgs() << "\t\t[MI <" << printMBBReference(*UseMI->getParent())
<< ">]: " << *UseMI << "\n");
int UseMOnum = -1;
unsigned NumOperands = UseMI->getNumOperands();
for (unsigned j = 0; j < NumOperands - 1; ++j) {
const MachineOperand &op = UseMI->getOperand(j);
if (op.isReg() && op.isUse() && DefR == op.getReg())
UseMOnum = j;
}
// It is possible that the register will not be found in any operand.
// This could happen, for example, when DefR = R4, but the used
// register is D2.
// Change UseMI if replacement is possible. If any replacement failed,
// or wasn't attempted, make sure to keep the TFR.
bool Xformed = false;
if (UseMOnum >= 0 && InstrEvalResult[UseMI])
Xformed = xformUseMI(MI, UseMI, UseN, UseMOnum);
Changed |= Xformed;
KeepTfr |= !Xformed;
}
if (!KeepTfr)
Deleted.insert(MI);
}
return Changed;
}
bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
bool Changed = false;
auto &HST = MF.getSubtarget<HexagonSubtarget>();
MRI = &MF.getRegInfo();
HII = HST.getInstrInfo();
HRI = HST.getRegisterInfo();
const auto &MDF = getAnalysis<MachineDominanceFrontier>();
MDT = &getAnalysis<MachineDominatorTree>();
const TargetOperandInfo TOI(*HII);
DataFlowGraph G(MF, *HII, *HRI, *MDT, MDF, TOI);
// Need to keep dead phis because we can propagate uses of registers into
// nodes dominated by those would-be phis.
G.build(BuildOptions::KeepDeadPhis);
DFG = &G;
Liveness L(*MRI, *DFG);
L.computePhiInfo();
LV = &L;
Deleted.clear();
NodeAddr<FuncNode *> FA = DFG->getFunc();
LLVM_DEBUG(dbgs() << "==== [RefMap#]=====:\n "
<< Print<NodeAddr<FuncNode *>>(FA, *DFG) << "\n");
for (NodeAddr<BlockNode *> BA : FA.Addr->members(*DFG))
Changed |= processBlock(BA);
for (auto MI : Deleted)
MI->eraseFromParent();
if (Changed) {
G.build();
L.computeLiveIns();
L.resetLiveIns();
L.resetKills();
}
return Changed;
}
//===----------------------------------------------------------------------===//
// Public Constructor Functions
//===----------------------------------------------------------------------===//
FunctionPass *llvm::createHexagonOptAddrMode() {
return new HexagonOptAddrMode();
}
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp (revision 362609)
@@ -1,341 +1,341 @@
//===- HexagonRDFOpt.cpp --------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "HexagonInstrInfo.h"
#include "HexagonSubtarget.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "RDFCopy.h"
#include "RDFDeadCode.h"
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
-#include "RDFRegisters.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <limits>
#include <utility>
using namespace llvm;
using namespace rdf;
namespace llvm {
void initializeHexagonRDFOptPass(PassRegistry&);
FunctionPass *createHexagonRDFOpt();
} // end namespace llvm
static unsigned RDFCount = 0;
static cl::opt<unsigned> RDFLimit("rdf-limit",
cl::init(std::numeric_limits<unsigned>::max()));
static cl::opt<bool> RDFDump("rdf-dump", cl::init(false));
namespace {
class HexagonRDFOpt : public MachineFunctionPass {
public:
HexagonRDFOpt() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachineDominanceFrontier>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
StringRef getPassName() const override {
return "Hexagon RDF optimizations";
}
bool runOnMachineFunction(MachineFunction &MF) override;
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}
static char ID;
private:
MachineDominatorTree *MDT;
MachineRegisterInfo *MRI;
};
struct HexagonCP : public CopyPropagation {
HexagonCP(DataFlowGraph &G) : CopyPropagation(G) {}
bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) override;
};
struct HexagonDCE : public DeadCodeElimination {
HexagonDCE(DataFlowGraph &G, MachineRegisterInfo &MRI)
: DeadCodeElimination(G, MRI) {}
bool rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove);
void removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum);
bool run();
};
} // end anonymous namespace
char HexagonRDFOpt::ID = 0;
INITIALIZE_PASS_BEGIN(HexagonRDFOpt, "hexagon-rdf-opt",
"Hexagon RDF optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
INITIALIZE_PASS_END(HexagonRDFOpt, "hexagon-rdf-opt",
"Hexagon RDF optimizations", false, false)
bool HexagonCP::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
auto mapRegs = [&EM] (RegisterRef DstR, RegisterRef SrcR) -> void {
EM.insert(std::make_pair(DstR, SrcR));
};
DataFlowGraph &DFG = getDFG();
unsigned Opc = MI->getOpcode();
switch (Opc) {
case Hexagon::A2_combinew: {
const MachineOperand &DstOp = MI->getOperand(0);
const MachineOperand &HiOp = MI->getOperand(1);
const MachineOperand &LoOp = MI->getOperand(2);
assert(DstOp.getSubReg() == 0 && "Unexpected subregister");
mapRegs(DFG.makeRegRef(DstOp.getReg(), Hexagon::isub_hi),
DFG.makeRegRef(HiOp.getReg(), HiOp.getSubReg()));
mapRegs(DFG.makeRegRef(DstOp.getReg(), Hexagon::isub_lo),
DFG.makeRegRef(LoOp.getReg(), LoOp.getSubReg()));
return true;
}
case Hexagon::A2_addi: {
const MachineOperand &A = MI->getOperand(2);
if (!A.isImm() || A.getImm() != 0)
return false;
LLVM_FALLTHROUGH;
}
case Hexagon::A2_tfr: {
const MachineOperand &DstOp = MI->getOperand(0);
const MachineOperand &SrcOp = MI->getOperand(1);
mapRegs(DFG.makeRegRef(DstOp.getReg(), DstOp.getSubReg()),
DFG.makeRegRef(SrcOp.getReg(), SrcOp.getSubReg()));
return true;
}
}
return CopyPropagation::interpretAsCopy(MI, EM);
}
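// Run dead code elimination. In addition to removing fully dead
// instructions, rewrite 'partly dead' statements, i.e. live instructions
// with some dead defs, such as post-increment memory accesses whose address
// update is unused.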
bool HexagonDCE::run() {
bool Collected = collect();
if (!Collected)
return false;
const SetVector<NodeId> &DeadNodes = getDeadNodes();
const SetVector<NodeId> &DeadInstrs = getDeadInstrs();
using RefToInstrMap = DenseMap<NodeId, NodeId>;
RefToInstrMap R2I;
SetVector<NodeId> PartlyDead;
DataFlowGraph &DFG = getDFG();
for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) {
for (auto TA : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Stmt>, DFG)) {
NodeAddr<StmtNode*> SA = TA;
for (NodeAddr<RefNode*> RA : SA.Addr->members(DFG)) {
R2I.insert(std::make_pair(RA.Id, SA.Id));
if (DFG.IsDef(RA) && DeadNodes.count(RA.Id))
if (!DeadInstrs.count(SA.Id))
PartlyDead.insert(SA.Id);
}
}
}
// Nodes to remove.
SetVector<NodeId> Remove = DeadInstrs;
bool Changed = false;
for (NodeId N : PartlyDead) {
auto SA = DFG.addr<StmtNode*>(N);
if (trace())
dbgs() << "Partly dead: " << *SA.Addr->getCode();
Changed |= rewrite(SA, Remove);
}
return erase(Remove) || Changed;
}
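// Remove operand OpNum from MI and re-associate the RDF ref nodes with the
// remaining operands, whose indices shift down past OpNum.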
void HexagonDCE::removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum) {
MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
auto getOpNum = [MI] (MachineOperand &Op) -> unsigned {
for (unsigned i = 0, n = MI->getNumOperands(); i != n; ++i)
if (&MI->getOperand(i) == &Op)
return i;
llvm_unreachable("Invalid operand");
};
DenseMap<NodeId,unsigned> OpMap;
DataFlowGraph &DFG = getDFG();
NodeList Refs = IA.Addr->members(DFG);
for (NodeAddr<RefNode*> RA : Refs)
OpMap.insert(std::make_pair(RA.Id, getOpNum(RA.Addr->getOp())));
MI->RemoveOperand(OpNum);
for (NodeAddr<RefNode*> RA : Refs) {
unsigned N = OpMap[RA.Id];
if (N < OpNum)
RA.Addr->setRegRef(&MI->getOperand(N), DFG);
else if (N > OpNum)
RA.Addr->setRegRef(&MI->getOperand(N-1), DFG);
}
}
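// If the address update of a post-increment load/store is dead, convert the
// instruction to its base+immediate-offset form and mark the dead defs for
// removal.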
bool HexagonDCE::rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove) {
if (!getDFG().IsCode<NodeAttrs::Stmt>(IA))
return false;
DataFlowGraph &DFG = getDFG();
MachineInstr &MI = *NodeAddr<StmtNode*>(IA).Addr->getCode();
auto &HII = static_cast<const HexagonInstrInfo&>(DFG.getTII());
if (HII.getAddrMode(MI) != HexagonII::PostInc)
return false;
unsigned Opc = MI.getOpcode();
unsigned OpNum, NewOpc;
switch (Opc) {
case Hexagon::L2_loadri_pi:
NewOpc = Hexagon::L2_loadri_io;
OpNum = 1;
break;
case Hexagon::L2_loadrd_pi:
NewOpc = Hexagon::L2_loadrd_io;
OpNum = 1;
break;
case Hexagon::V6_vL32b_pi:
NewOpc = Hexagon::V6_vL32b_ai;
OpNum = 1;
break;
case Hexagon::S2_storeri_pi:
NewOpc = Hexagon::S2_storeri_io;
OpNum = 0;
break;
case Hexagon::S2_storerd_pi:
NewOpc = Hexagon::S2_storerd_io;
OpNum = 0;
break;
case Hexagon::V6_vS32b_pi:
NewOpc = Hexagon::V6_vS32b_ai;
OpNum = 0;
break;
default:
return false;
}
auto IsDead = [this] (NodeAddr<DefNode*> DA) -> bool {
return getDeadNodes().count(DA.Id);
};
NodeList Defs;
MachineOperand &Op = MI.getOperand(OpNum);
for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) {
if (&DA.Addr->getOp() != &Op)
continue;
Defs = DFG.getRelatedRefs(IA, DA);
if (!llvm::all_of(Defs, IsDead))
return false;
break;
}
// Mark all nodes in Defs for removal.
for (auto D : Defs)
Remove.insert(D.Id);
if (trace())
dbgs() << "Rewriting: " << MI;
MI.setDesc(HII.get(NewOpc));
MI.getOperand(OpNum+2).setImm(0);
removeOperand(IA, OpNum);
if (trace())
dbgs() << " to: " << MI;
return true;
}
bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
if (RDFLimit.getPosition()) {
if (RDFCount >= RDFLimit)
return false;
RDFCount++;
}
MDT = &getAnalysis<MachineDominatorTree>();
const auto &MDF = getAnalysis<MachineDominanceFrontier>();
const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
MRI = &MF.getRegInfo();
bool Changed;
if (RDFDump)
MF.print(dbgs() << "Before " << getPassName() << "\n", nullptr);
TargetOperandInfo TOI(HII);
DataFlowGraph G(MF, HII, HRI, *MDT, MDF, TOI);
// Dead phi nodes are necessary for copy propagation: we can add a use
// of a register in a block where it would need a phi node, but which
// was dead (and removed) during the graph build time.
G.build(BuildOptions::KeepDeadPhis);
if (RDFDump)
dbgs() << "Starting copy propagation on: " << MF.getName() << '\n'
<< PrintNode<FuncNode*>(G.getFunc(), G) << '\n';
HexagonCP CP(G);
CP.trace(RDFDump);
Changed = CP.run();
if (RDFDump)
dbgs() << "Starting dead code elimination on: " << MF.getName() << '\n'
<< PrintNode<FuncNode*>(G.getFunc(), G) << '\n';
HexagonDCE DCE(G, *MRI);
DCE.trace(RDFDump);
Changed |= DCE.run();
if (Changed) {
if (RDFDump)
dbgs() << "Starting liveness recomputation on: " << MF.getName() << '\n';
Liveness LV(*MRI, G);
LV.trace(RDFDump);
LV.computeLiveIns();
LV.resetLiveIns();
LV.resetKills();
}
if (RDFDump)
MF.print(dbgs() << "After " << getPassName() << "\n", nullptr);
return false;
}
FunctionPass *llvm::createHexagonRDFOpt() {
return new HexagonRDFOpt();
}
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFCopy.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFCopy.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFCopy.cpp (revision 362609)
@@ -1,213 +1,213 @@
//===- RDFCopy.cpp --------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RDF-based copy propagation.
//
//===----------------------------------------------------------------------===//
#include "RDFCopy.h"
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
-#include "RDFRegisters.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <utility>
using namespace llvm;
using namespace rdf;
#ifndef NDEBUG
static cl::opt<unsigned> CpLimit("rdf-cp-limit", cl::init(0), cl::Hidden);
static unsigned CpCount = 0;
#endif
bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
unsigned Opc = MI->getOpcode();
switch (Opc) {
case TargetOpcode::COPY: {
const MachineOperand &Dst = MI->getOperand(0);
const MachineOperand &Src = MI->getOperand(1);
RegisterRef DstR = DFG.makeRegRef(Dst.getReg(), Dst.getSubReg());
RegisterRef SrcR = DFG.makeRegRef(Src.getReg(), Src.getSubReg());
assert(Register::isPhysicalRegister(DstR.Reg));
assert(Register::isPhysicalRegister(SrcR.Reg));
const TargetRegisterInfo &TRI = DFG.getTRI();
if (TRI.getMinimalPhysRegClass(DstR.Reg) !=
TRI.getMinimalPhysRegClass(SrcR.Reg))
return false;
EM.insert(std::make_pair(DstR, SrcR));
return true;
}
case TargetOpcode::REG_SEQUENCE:
llvm_unreachable("Unexpected REG_SEQUENCE");
}
return false;
}
void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM) {
CopyMap.insert(std::make_pair(SA.Id, EM));
Copies.push_back(SA.Id);
}
bool CopyPropagation::scanBlock(MachineBasicBlock *B) {
bool Changed = false;
NodeAddr<BlockNode*> BA = DFG.findBlock(B);
for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
if (DFG.IsCode<NodeAttrs::Stmt>(IA)) {
NodeAddr<StmtNode*> SA = IA;
EqualityMap EM;
if (interpretAsCopy(SA.Addr->getCode(), EM))
recordCopy(SA, EM);
}
}
MachineDomTreeNode *N = MDT.getNode(B);
for (auto I : *N)
Changed |= scanBlock(I->getBlock());
return Changed;
}
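// Return the def of RefRR that reaches IA: either the nearest aliased ref
// itself if it is a def, or the reaching def of that ref if it is a use.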
NodeId CopyPropagation::getLocalReachingDef(RegisterRef RefRR,
NodeAddr<InstrNode*> IA) {
NodeAddr<RefNode*> RA = L.getNearestAliasedRef(RefRR, IA);
if (RA.Id != 0) {
if (RA.Addr->getKind() == NodeAttrs::Def)
return RA.Id;
assert(RA.Addr->getKind() == NodeAttrs::Use);
if (NodeId RD = RA.Addr->getReachingDef())
return RD;
}
return 0;
}
bool CopyPropagation::run() {
scanBlock(&DFG.getMF().front());
if (trace()) {
dbgs() << "Copies:\n";
for (NodeId I : Copies) {
dbgs() << "Instr: " << *DFG.addr<StmtNode*>(I).Addr->getCode();
dbgs() << " eq: {";
for (auto J : CopyMap[I])
dbgs() << ' ' << Print<RegisterRef>(J.first, DFG) << '='
<< Print<RegisterRef>(J.second, DFG);
dbgs() << " }\n";
}
}
bool Changed = false;
#ifndef NDEBUG
bool HasLimit = CpLimit.getNumOccurrences() > 0;
#endif
auto MinPhysReg = [this] (RegisterRef RR) -> unsigned {
const TargetRegisterInfo &TRI = DFG.getTRI();
const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
if ((RC.LaneMask & RR.Mask) == RC.LaneMask)
return RR.Reg;
for (MCSubRegIndexIterator S(RR.Reg, &TRI); S.isValid(); ++S)
if (RR.Mask == TRI.getSubRegIndexLaneMask(S.getSubRegIndex()))
return S.getSubReg();
llvm_unreachable("Should have found a register");
return 0;
};
for (NodeId C : Copies) {
#ifndef NDEBUG
if (HasLimit && CpCount >= CpLimit)
break;
#endif
auto SA = DFG.addr<InstrNode*>(C);
auto FS = CopyMap.find(SA.Id);
if (FS == CopyMap.end())
continue;
EqualityMap &EM = FS->second;
for (NodeAddr<DefNode*> DA : SA.Addr->members_if(DFG.IsDef, DFG)) {
RegisterRef DR = DA.Addr->getRegRef(DFG);
auto FR = EM.find(DR);
if (FR == EM.end())
continue;
RegisterRef SR = FR->second;
if (DR == SR)
continue;
NodeId AtCopy = getLocalReachingDef(SR, SA);
for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) {
auto UA = DFG.addr<UseNode*>(N);
NextN = UA.Addr->getSibling();
uint16_t F = UA.Addr->getFlags();
if ((F & NodeAttrs::PhiRef) || (F & NodeAttrs::Fixed))
continue;
if (UA.Addr->getRegRef(DFG) != DR)
continue;
NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG);
assert(DFG.IsCode<NodeAttrs::Stmt>(IA));
NodeId AtUse = getLocalReachingDef(SR, IA);
if (AtCopy != AtUse)
continue;
MachineOperand &Op = UA.Addr->getOp();
if (Op.isTied())
continue;
if (trace()) {
dbgs() << "Can replace " << Print<RegisterRef>(DR, DFG)
<< " with " << Print<RegisterRef>(SR, DFG) << " in "
<< *NodeAddr<StmtNode*>(IA).Addr->getCode();
}
unsigned NewReg = MinPhysReg(SR);
Op.setReg(NewReg);
Op.setSubReg(0);
DFG.unlinkUse(UA, false);
if (AtCopy != 0) {
UA.Addr->linkToDef(UA.Id, DFG.addr<DefNode*>(AtCopy));
} else {
UA.Addr->setReachingDef(0);
UA.Addr->setSibling(0);
}
Changed = true;
#ifndef NDEBUG
if (HasLimit && CpCount >= CpLimit)
break;
CpCount++;
#endif
auto FC = CopyMap.find(IA.Id);
if (FC != CopyMap.end()) {
// Update the EM map in the copy's entry.
auto &M = FC->second;
for (auto &J : M) {
if (J.second != DR)
continue;
J.second = SR;
break;
}
}
} // for (N in reached-uses)
} // for (DA in defs)
} // for (C in Copies)
return Changed;
}
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFCopy.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFCopy.h (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFCopy.h (revision 362609)
@@ -1,61 +1,61 @@
//===- RDFCopy.h ------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
#define LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
-#include "RDFRegisters.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <map>
#include <vector>
namespace llvm {
class MachineBasicBlock;
class MachineDominatorTree;
class MachineInstr;
namespace rdf {
struct CopyPropagation {
CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg),
L(dfg.getMF().getRegInfo(), dfg) {}
virtual ~CopyPropagation() = default;
bool run();
void trace(bool On) { Trace = On; }
bool trace() const { return Trace; }
DataFlowGraph &getDFG() { return DFG; }
using EqualityMap = std::map<RegisterRef, RegisterRef>;
virtual bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM);
private:
const MachineDominatorTree &MDT;
DataFlowGraph &DFG;
Liveness L;
bool Trace = false;
// map: statement -> (map: dst reg -> src reg)
std::map<NodeId, EqualityMap> CopyMap;
std::vector<NodeId> Copies;
void recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM);
bool scanBlock(MachineBasicBlock *B);
NodeId getLocalReachingDef(RegisterRef RefRR, NodeAddr<InstrNode*> IA);
};
} // end namespace rdf
} // end namespace llvm
#endif // LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.cpp (revision 362609)
@@ -1,243 +1,243 @@
//===--- RDFDeadCode.cpp --------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RDF-based generic dead code elimination.
#include "RDFDeadCode.h"
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
#include "llvm/Support/Debug.h"
#include <queue>
using namespace llvm;
using namespace rdf;
// This drastically improves execution time in "collect" over using
// SetVector as a work queue, and popping the first element from it.
template<typename T> struct DeadCodeElimination::SetQueue {
SetQueue() : Set(), Queue() {}
bool empty() const {
return Queue.empty();
}
T pop_front() {
T V = Queue.front();
Queue.pop();
Set.erase(V);
return V;
}
void push_back(T V) {
if (Set.count(V))
return;
Queue.push(V);
Set.insert(V);
}
private:
DenseSet<T> Set;
std::queue<T> Queue;
};
// Check if the given instruction has observable side-effects, i.e. if
// it should be considered "live". It is safe for this function to be
// overly conservative (i.e. return "true" for all instructions), but it
// is not safe to return "false" for an instruction that should not be
// considered removable.
bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const {
if (MI->mayStore() || MI->isBranch() || MI->isCall() || MI->isReturn())
return true;
if (MI->hasOrderedMemoryRef() || MI->hasUnmodeledSideEffects() ||
MI->isPosition())
return true;
if (MI->isPHI())
return false;
for (auto &Op : MI->operands()) {
if (Op.isReg() && MRI.isReserved(Op.getReg()))
return true;
if (Op.isRegMask()) {
const uint32_t *BM = Op.getRegMask();
for (unsigned R = 0, RN = DFG.getTRI().getNumRegs(); R != RN; ++R) {
if (BM[R/32] & (1u << (R%32)))
continue;
if (MRI.isReserved(R))
return true;
}
}
}
return false;
}
void DeadCodeElimination::scanInstr(NodeAddr<InstrNode*> IA,
SetQueue<NodeId> &WorkQ) {
if (!DFG.IsCode<NodeAttrs::Stmt>(IA))
return;
if (!isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode()))
return;
for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) {
if (!LiveNodes.count(RA.Id))
WorkQ.push_back(RA.Id);
}
}
void DeadCodeElimination::processDef(NodeAddr<DefNode*> DA,
SetQueue<NodeId> &WorkQ) {
NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG);
for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
if (!LiveNodes.count(UA.Id))
WorkQ.push_back(UA.Id);
}
for (NodeAddr<DefNode*> TA : DFG.getRelatedRefs(IA, DA))
LiveNodes.insert(TA.Id);
}
void DeadCodeElimination::processUse(NodeAddr<UseNode*> UA,
SetQueue<NodeId> &WorkQ) {
for (NodeAddr<DefNode*> DA : LV.getAllReachingDefs(UA)) {
if (!LiveNodes.count(DA.Id))
WorkQ.push_back(DA.Id);
}
}
// Traverse the DFG and collect the set of dead RefNodes and the set of
// dead instructions. Return "true" if any of these sets is non-empty,
// "false" otherwise.
bool DeadCodeElimination::collect() {
// This function works by first finding all live nodes. The dead nodes
// are then the complement of the set of live nodes.
//
// Assume that all nodes are dead. Identify instructions which must be
// considered live, i.e. instructions with observable side-effects, such
// as calls and stores. All arguments of such instructions are considered
// live. For each live def, all operands used in the corresponding
// instruction are considered live. For each live use, all its reaching
// defs are considered live.
LiveNodes.clear();
SetQueue<NodeId> WorkQ;
for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG))
for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG))
scanInstr(IA, WorkQ);
while (!WorkQ.empty()) {
NodeId N = WorkQ.pop_front();
LiveNodes.insert(N);
auto RA = DFG.addr<RefNode*>(N);
if (DFG.IsDef(RA))
processDef(RA, WorkQ);
else
processUse(RA, WorkQ);
}
if (trace()) {
dbgs() << "Live nodes:\n";
for (NodeId N : LiveNodes) {
auto RA = DFG.addr<RefNode*>(N);
dbgs() << PrintNode<RefNode*>(RA, DFG) << "\n";
}
}
auto IsDead = [this] (NodeAddr<InstrNode*> IA) -> bool {
for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG))
if (LiveNodes.count(DA.Id))
return false;
return true;
};
for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) {
for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
if (!LiveNodes.count(RA.Id))
DeadNodes.insert(RA.Id);
if (DFG.IsCode<NodeAttrs::Stmt>(IA))
if (isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode()))
continue;
if (IsDead(IA)) {
DeadInstrs.insert(IA.Id);
if (trace())
dbgs() << "Dead instr: " << PrintNode<InstrNode*>(IA, DFG) << "\n";
}
}
}
return !DeadNodes.empty();
}
// Erase the nodes given in the Nodes set from DFG. In addition to removing
// them from the DFG, if a node corresponds to a statement, the corresponding
// machine instruction is erased from the function.
bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) {
if (Nodes.empty())
return false;
// Prepare the actual set of ref nodes to remove: ref nodes from Nodes
// are included directly; for each InstrNode in Nodes, include the set
// of all RefNodes from it.
NodeList DRNs, DINs;
for (auto I : Nodes) {
auto BA = DFG.addr<NodeBase*>(I);
uint16_t Type = BA.Addr->getType();
if (Type == NodeAttrs::Ref) {
DRNs.push_back(DFG.addr<RefNode*>(I));
continue;
}
// If it's a code node, add all ref nodes from it.
uint16_t Kind = BA.Addr->getKind();
if (Kind == NodeAttrs::Stmt || Kind == NodeAttrs::Phi) {
for (auto N : NodeAddr<CodeNode*>(BA).Addr->members(DFG))
DRNs.push_back(N);
DINs.push_back(DFG.addr<InstrNode*>(I));
} else {
llvm_unreachable("Unexpected code node");
return false;
}
}
// Sort the list so that use nodes are removed first. This makes the
// "unlink" functions a bit faster.
auto UsesFirst = [] (NodeAddr<RefNode*> A, NodeAddr<RefNode*> B) -> bool {
uint16_t KindA = A.Addr->getKind(), KindB = B.Addr->getKind();
if (KindA == NodeAttrs::Use && KindB == NodeAttrs::Def)
return true;
if (KindA == NodeAttrs::Def && KindB == NodeAttrs::Use)
return false;
return A.Id < B.Id;
};
llvm::sort(DRNs, UsesFirst);
if (trace())
dbgs() << "Removing dead ref nodes:\n";
for (NodeAddr<RefNode*> RA : DRNs) {
if (trace())
dbgs() << " " << PrintNode<RefNode*>(RA, DFG) << '\n';
if (DFG.IsUse(RA))
DFG.unlinkUse(RA, true);
else if (DFG.IsDef(RA))
DFG.unlinkDef(RA, true);
}
// Now, remove all dead instruction nodes.
for (NodeAddr<InstrNode*> IA : DINs) {
NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
BA.Addr->removeMember(IA, DFG);
if (!DFG.IsCode<NodeAttrs::Stmt>(IA))
continue;
MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
if (trace())
dbgs() << "erasing: " << *MI;
MI->eraseFromParent();
}
return true;
}
Index: head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.h (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.h (revision 362609)
@@ -1,66 +1,66 @@
//===--- RDFDeadCode.h ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RDF-based generic dead code elimination.
//
// The main interface of this class consists of the functions "collect" and "erase".
// This allows custom processing of the function being optimized by a
// particular consumer. The simplest way to use this class would be to
// instantiate an object, and then simply call "collect" and "erase",
// passing the result of "getDeadInstrs()" to it.
// A more complex scenario would be to call "collect" first, then visit
// all post-increment instructions to see if the address update is dead
// or not, and if it is, convert the instruction to a non-updating form.
// After that "erase" can be called with the set of nodes including both,
// dead defs from the updating instructions and the nodes corresponding
// to the dead instructions.
#ifndef RDF_DEADCODE_H
#define RDF_DEADCODE_H
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
#include "llvm/ADT/SetVector.h"
namespace llvm {
class MachineRegisterInfo;
namespace rdf {
struct DeadCodeElimination {
DeadCodeElimination(DataFlowGraph &dfg, MachineRegisterInfo &mri)
: Trace(false), DFG(dfg), MRI(mri), LV(mri, dfg) {}
bool collect();
bool erase(const SetVector<NodeId> &Nodes);
void trace(bool On) { Trace = On; }
bool trace() const { return Trace; }
SetVector<NodeId> getDeadNodes() { return DeadNodes; }
SetVector<NodeId> getDeadInstrs() { return DeadInstrs; }
DataFlowGraph &getDFG() { return DFG; }
private:
bool Trace;
SetVector<NodeId> LiveNodes;
SetVector<NodeId> DeadNodes;
SetVector<NodeId> DeadInstrs;
DataFlowGraph &DFG;
MachineRegisterInfo &MRI;
Liveness LV;
template<typename T> struct SetQueue;
bool isLiveInstr(const MachineInstr *MI) const;
void scanInstr(NodeAddr<InstrNode*> IA, SetQueue<NodeId> &WorkQ);
void processDef(NodeAddr<DefNode*> DA, SetQueue<NodeId> &WorkQ);
void processUse(NodeAddr<UseNode*> UA, SetQueue<NodeId> &WorkQ);
};
} // namespace rdf
} // namespace llvm
#endif
Index: head/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td (revision 362609)
@@ -1,1430 +1,1431 @@
//===- P9InstrResources.td - P9 Instruction Resource Defs -*- tablegen -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the resources required by P9 instructions. This is part of
// the P9 processor model used for instruction scheduling. This file should
// contain all the instructions that may be used on Power 9. This is not
// just instructions that are new on Power 9 but also instructions that were
// available on earlier architectures and are still used in Power 9.
//
// The makeup of the P9 CPU is modeled as follows:
// - Each CPU is made up of two superslices.
// - Each superslice is made up of two slices. Therefore, there are 4 slices
// for each CPU.
// - Up to 6 instructions can be dispatched to each CPU. Three per superslice.
// - Each CPU has:
// - One CY (Crypto) unit P9_CY_*
// - One DFU (Decimal Floating Point and Quad Precision) unit P9_DFU_*
// - Two PM (Permute) units. One on each superslice. P9_PM_*
// - Two DIV (Fixed Point Divide) units. One on each superslice. P9_DIV_*
// - Four ALU (Fixed Point Arithmetic) units. One on each slice. P9_ALU_*
// - Four DP (Floating Point) units. One on each slice. P9_DP_*
// This also includes fixed point multiply add.
// - Four AGEN (Address Generation) units. One for each slice. P9_AGEN_*
// - Four Load/Store Queues. P9_LS_*
// - Each set of instructions will require a number of these resources.
//===----------------------------------------------------------------------===//
// Two cycle ALU vector operation that uses an entire superslice.
// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
// (EXECE, EXECO) and 1 dispatch (DISP) to the given superslice.
def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
(instregex "VADDU(B|H|W|D)M$"),
(instregex "VAND(C)?$"),
(instregex "VEXTS(B|H|W)2(D|W)(s)?$"),
(instregex "V_SET0(B|H)?$"),
(instregex "VS(R|L)(B|H|W|D)$"),
(instregex "VSUBU(B|H|W|D)M$"),
(instregex "VPOPCNT(B|H)$"),
(instregex "VRL(B|H|W|D)$"),
(instregex "VSRA(B|H|W|D)$"),
(instregex "XV(N)?ABS(D|S)P$"),
(instregex "XVCPSGN(D|S)P$"),
(instregex "XV(I|X)EXP(D|S)P$"),
(instregex "VRL(D|W)(MI|NM)$"),
(instregex "VMRG(E|O)W$"),
MTVSRDD,
VEQV,
VNAND,
VNEGD,
VNEGW,
VNOR,
VOR,
VORC,
VSEL,
VXOR,
XVNEGDP,
XVNEGSP,
XXLAND,
XXLANDC,
XXLEQV,
XXLEQVOnes,
XXLNAND,
XXLNOR,
XXLOR,
XXLORf,
XXLORC,
XXLXOR,
XXLXORdpz,
XXLXORspz,
XXLXORz,
XXSEL,
XSABSQP,
XSCPSGNQP,
XSIEXPQP,
XSNABSQP,
XSNEGQP,
XSXEXPQP
)>;
// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a
// single slice. However, since it is Restricted, it requires all 3 dispatches
// (DISP) for that superslice.
def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
(instregex "TABORT(D|W)C(I)?$"),
(instregex "MTFSB(0|1)$"),
(instregex "MFFSC(D)?RN(I)?$"),
(instregex "CMPRB(8)?$"),
(instregex "TD(I)?$"),
(instregex "TW(I)?$"),
(instregex "FCMPU(S|D)$"),
(instregex "XSTSTDC(S|D)P$"),
FTDIV,
FTSQRT,
CMPEQB
)>;
// Standard Dispatch ALU operation for 3 cycles. Only one slice used.
def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C],
(instrs
(instregex "XSMAX(C|J)?DP$"),
(instregex "XSMIN(C|J)?DP$"),
(instregex "XSCMP(EQ|EXP|GE|GT|O|U)DP$"),
(instregex "CNT(L|T)Z(D|W)(8)?(_rec)?$"),
(instregex "POPCNT(D|W)$"),
(instregex "CMPB(8)?$"),
(instregex "SETB(8)?$"),
XSTDIVDP,
XSTSQRTDP,
XSXSIGDP,
XSCVSPDPN,
BPERMD
)>;
// Standard Dispatch ALU operation for 2 cycles. Only one slice used.
def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C],
(instrs
(instregex "S(L|R)D$"),
(instregex "SRAD(I)?$"),
(instregex "EXTSWSLI_32_64$"),
(instregex "MFV(S)?RD$"),
(instregex "MTV(S)?RD$"),
(instregex "MTV(S)?RW(A|Z)$"),
(instregex "CMP(WI|LWI|W|LW)(8)?$"),
(instregex "CMP(L)?D(I)?$"),
(instregex "SUBF(I)?C(8)?(O)?$"),
(instregex "ANDI(S)?(8)?(_rec)?$"),
(instregex "ADDC(8)?(O)?$"),
(instregex "ADDIC(8)?(_rec)?$"),
(instregex "ADD(8|4)(O)?(_rec)?$"),
(instregex "ADD(E|ME|ZE)(8)?(O)?(_rec)?$"),
(instregex "SUBF(E|ME|ZE)?(8)?(O)?(_rec)?$"),
(instregex "NEG(8)?(O)?(_rec)?$"),
(instregex "POPCNTB$"),
(instregex "ADD(I|IS)?(8)?$"),
(instregex "LI(S)?(8)?$"),
(instregex "(X)?OR(I|IS)?(8)?(_rec)?$"),
(instregex "NAND(8)?(_rec)?$"),
(instregex "AND(C)?(8)?(_rec)?$"),
(instregex "NOR(8)?(_rec)?$"),
(instregex "OR(C)?(8)?(_rec)?$"),
(instregex "EQV(8)?(_rec)?$"),
(instregex "EXTS(B|H|W)(8)?(_32)?(_64)?(_rec)?$"),
(instregex "ADD(4|8)(TLS)?(_)?$"),
(instregex "NEG(8)?(O)?$"),
(instregex "ADDI(S)?toc(HA|L)(8)?$"),
COPY,
MCRF,
MCRXRX,
XSNABSDP,
XSXEXPDP,
XSABSDP,
XSNEGDP,
XSCPSGNDP,
MFVSRWZ,
MFVRWZ,
EXTSWSLI,
SRADI_32,
RLDIC,
RFEBB,
LA,
TBEGIN,
TRECHKPT,
NOP,
WAIT
)>;
// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a
// single slice. However, since it is Restricted, it requires all 3 dispatches
// (DISP) for that superslice.
def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
(instregex "RLDC(L|R)$"),
(instregex "RLWIMI(8)?$"),
(instregex "RLDIC(L|R)(_32)?(_64)?$"),
(instregex "M(F|T)OCRF(8)?$"),
(instregex "CR(6)?(UN)?SET$"),
(instregex "CR(N)?(OR|AND)(C)?$"),
(instregex "S(L|R)W(8)?$"),
(instregex "RLW(INM|NM)(8)?$"),
(instregex "F(N)?ABS(D|S)$"),
(instregex "FNEG(D|S)$"),
(instregex "FCPSGN(D|S)$"),
(instregex "SRAW(I)?$"),
(instregex "ISEL(8)?$"),
RLDIMI,
XSIEXPDP,
FMR,
CREQV,
CRXOR,
TRECLAIM,
TSR,
TABORT
)>;
// Three cycle ALU vector operation that uses an entire superslice.
// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
// (EXECE, EXECO) and 1 dispatch (DISP) to the given superslice.
def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
(instregex "M(T|F)VSCR$"),
(instregex "VCMPNEZ(B|H|W)$"),
(instregex "VCMPEQU(B|H|W|D)$"),
(instregex "VCMPNE(B|H|W)$"),
(instregex "VABSDU(B|H|W)$"),
(instregex "VADDU(B|H|W)S$"),
(instregex "VAVG(S|U)(B|H|W)$"),
(instregex "VCMP(EQ|GE|GT)FP(_rec)?$"),
(instregex "VCMPBFP(_rec)?$"),
(instregex "VC(L|T)Z(B|H|W|D)$"),
(instregex "VADDS(B|H|W)S$"),
(instregex "V(MIN|MAX)FP$"),
(instregex "V(MIN|MAX)(S|U)(B|H|W|D)$"),
VBPERMD,
VADDCUW,
VPOPCNTW,
VPOPCNTD,
VPRTYBD,
VPRTYBW,
VSHASIGMAD,
VSHASIGMAW,
VSUBSBS,
VSUBSHS,
VSUBSWS,
VSUBUBS,
VSUBUHS,
VSUBUWS,
VSUBCUW,
VCMPGTSB,
VCMPGTSB_rec,
VCMPGTSD,
VCMPGTSD_rec,
VCMPGTSH,
VCMPGTSH_rec,
VCMPGTSW,
VCMPGTSW_rec,
VCMPGTUB,
VCMPGTUB_rec,
VCMPGTUD,
VCMPGTUD_rec,
VCMPGTUH,
VCMPGTUH_rec,
VCMPGTUW,
VCMPGTUW_rec,
VCMPNEB_rec,
VCMPNEH_rec,
VCMPNEW_rec,
VCMPNEZB_rec,
VCMPNEZH_rec,
VCMPNEZW_rec,
VCMPEQUB_rec,
VCMPEQUD_rec,
VCMPEQUH_rec,
VCMPEQUW_rec,
XVCMPEQDP,
XVCMPEQDP_rec,
XVCMPEQSP,
XVCMPEQSP_rec,
XVCMPGEDP,
XVCMPGEDP_rec,
XVCMPGESP,
XVCMPGESP_rec,
XVCMPGTDP,
XVCMPGTDP_rec,
XVCMPGTSP,
XVCMPGTSP_rec,
XVMAXDP,
XVMAXSP,
XVMINDP,
XVMINSP,
XVTDIVDP,
XVTDIVSP,
XVTSQRTDP,
XVTSQRTSP,
XVTSTDCDP,
XVTSTDCSP,
XVXSIGDP,
XVXSIGSP
)>;
// 7 cycle DP vector operation that uses an entire superslice.
// Uses both DP units (the even DPE and odd DPO units), two pipelines (EXECE,
// EXECO) and 1 dispatch (DISP) to the given superslice.
def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
VADDFP,
VCTSXS,
VCTSXS_0,
VCTUXS,
VCTUXS_0,
VEXPTEFP,
VLOGEFP,
VMADDFP,
VMHADDSHS,
VNMSUBFP,
VREFP,
VRFIM,
VRFIN,
VRFIP,
VRFIZ,
VRSQRTEFP,
VSUBFP,
XVADDDP,
XVADDSP,
XVCVDPSP,
XVCVDPSXDS,
XVCVDPSXWS,
XVCVDPUXDS,
XVCVDPUXWS,
XVCVHPSP,
XVCVSPDP,
XVCVSPHP,
XVCVSPSXDS,
XVCVSPSXWS,
XVCVSPUXDS,
XVCVSPUXWS,
XVCVSXDDP,
XVCVSXDSP,
XVCVSXWDP,
XVCVSXWSP,
XVCVUXDDP,
XVCVUXDSP,
XVCVUXWDP,
XVCVUXWSP,
XVMADDADP,
XVMADDASP,
XVMADDMDP,
XVMADDMSP,
XVMSUBADP,
XVMSUBASP,
XVMSUBMDP,
XVMSUBMSP,
XVMULDP,
XVMULSP,
XVNMADDADP,
XVNMADDASP,
XVNMADDMDP,
XVNMADDMSP,
XVNMSUBADP,
XVNMSUBASP,
XVNMSUBMDP,
XVNMSUBMSP,
XVRDPI,
XVRDPIC,
XVRDPIM,
XVRDPIP,
XVRDPIZ,
XVREDP,
XVRESP,
XVRSPI,
XVRSPIC,
XVRSPIM,
XVRSPIP,
XVRSPIZ,
XVRSQRTEDP,
XVRSQRTESP,
XVSUBDP,
XVSUBSP,
VCFSX,
VCFSX_0,
VCFUX,
VCFUX_0,
VMHRADDSHS,
VMLADDUHM,
VMSUMMBM,
VMSUMSHM,
VMSUMSHS,
VMSUMUBM,
VMSUMUHM,
+ VMSUMUDM,
VMSUMUHS,
VMULESB,
VMULESH,
VMULESW,
VMULEUB,
VMULEUH,
VMULEUW,
VMULOSB,
VMULOSH,
VMULOSW,
VMULOUB,
VMULOUH,
VMULOUW,
VMULUWM,
VSUM2SWS,
VSUM4SBS,
VSUM4SHS,
VSUM4UBS,
VSUMSWS
)>;
// 5 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
// dispatch units for the superslice.
def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MADD(HD|HDU|LD|LD8)$"),
(instregex "MUL(HD|HW|LD|LI|LI8|LW)(U)?(O)?$")
)>;
// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
// dispatch units for the superslice.
def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FRSP,
(instregex "FRI(N|P|Z|M)(D|S)$"),
(instregex "FRE(S)?$"),
(instregex "FADD(S)?$"),
(instregex "FMSUB(S)?$"),
(instregex "FMADD(S)?$"),
(instregex "FSUB(S)?$"),
(instregex "FCFID(U)?(S)?$"),
(instregex "FCTID(U)?(Z)?$"),
(instregex "FCTIW(U)?(Z)?$"),
(instregex "FRSQRTE(S)?$"),
FNMADDS,
FNMADD,
FNMSUBS,
FNMSUB,
FSELD,
FSELS,
FMULS,
FMUL,
XSMADDADP,
XSMADDASP,
XSMADDMDP,
XSMADDMSP,
XSMSUBADP,
XSMSUBASP,
XSMSUBMDP,
XSMSUBMSP,
XSMULDP,
XSMULSP,
XSNMADDADP,
XSNMADDASP,
XSNMADDMDP,
XSNMADDMSP,
XSNMSUBADP,
XSNMSUBASP,
XSNMSUBMDP,
XSNMSUBMSP
)>;
// 7 cycle Restricted DP operation and one 3 cycle ALU operation.
// These operations can be done in parallel. The DP is restricted so we need a
// full 4 dispatches.
def : InstRW<[P9_DP_7C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "FSEL(D|S)_rec$")
)>;
// 5 Cycle Restricted DP operation and one 2 cycle ALU operation.
def : InstRW<[P9_DPOpAndALUOp_7C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "MUL(H|L)(D|W)(U)?(O)?_rec$")
)>;
// 7 cycle Restricted DP operation and one 3 cycle ALU operation.
// These operations must be done sequentially. The DP is restricted so we need a
// full 4 dispatches.
def : InstRW<[P9_DPOpAndALU2Op_10C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "FRI(N|P|Z|M)(D|S)_rec$"),
(instregex "FRE(S)?_rec$"),
(instregex "FADD(S)?_rec$"),
(instregex "FSUB(S)?_rec$"),
(instregex "F(N)?MSUB(S)?_rec$"),
(instregex "F(N)?MADD(S)?_rec$"),
(instregex "FCFID(U)?(S)?_rec$"),
(instregex "FCTID(U)?(Z)?_rec$"),
(instregex "FCTIW(U)?(Z)?_rec$"),
(instregex "FMUL(S)?_rec$"),
(instregex "FRSQRTE(S)?_rec$"),
FRSP_rec
)>;
// 7 cycle DP operation. One DP unit, one EXEC pipeline and 1 dispatch unit.
def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C],
(instrs
XSADDDP,
XSADDSP,
XSCVDPHP,
XSCVDPSP,
XSCVDPSXDS,
XSCVDPSXDSs,
XSCVDPSXWS,
XSCVDPUXDS,
XSCVDPUXDSs,
XSCVDPUXWS,
XSCVDPSXWSs,
XSCVDPUXWSs,
XSCVHPDP,
XSCVSPDP,
XSCVSXDDP,
XSCVSXDSP,
XSCVUXDDP,
XSCVUXDSP,
XSRDPI,
XSRDPIC,
XSRDPIM,
XSRDPIP,
XSRDPIZ,
XSREDP,
XSRESP,
XSRSQRTEDP,
XSRSQRTESP,
XSSUBDP,
XSSUBSP,
XSCVDPSPN,
XSRSP
)>;
// Three Cycle PM operation. Only one PM unit per superslice so we use the whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and one
// dispatch.
def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C],
(instrs
(instregex "LVS(L|R)$"),
(instregex "VSPLTIS(W|H|B)$"),
(instregex "VSPLT(W|H|B)(s)?$"),
(instregex "V_SETALLONES(B|H)?$"),
(instregex "VEXTRACTU(B|H|W)$"),
(instregex "VINSERT(B|H|W|D)$"),
MFVSRLD,
MTVSRWS,
VBPERMQ,
VCLZLSBB,
VCTZLSBB,
VEXTRACTD,
VEXTUBLX,
VEXTUBRX,
VEXTUHLX,
VEXTUHRX,
VEXTUWLX,
VEXTUWRX,
VGBBD,
VMRGHB,
VMRGHH,
VMRGHW,
VMRGLB,
VMRGLH,
VMRGLW,
VPERM,
VPERMR,
VPERMXOR,
VPKPX,
VPKSDSS,
VPKSDUS,
VPKSHSS,
VPKSHUS,
VPKSWSS,
VPKSWUS,
VPKUDUM,
VPKUDUS,
VPKUHUM,
VPKUHUS,
VPKUWUM,
VPKUWUS,
VPRTYBQ,
VSL,
VSLDOI,
VSLO,
VSLV,
VSR,
VSRO,
VSRV,
VUPKHPX,
VUPKHSB,
VUPKHSH,
VUPKHSW,
VUPKLPX,
VUPKLSB,
VUPKLSH,
VUPKLSW,
XXBRD,
XXBRH,
XXBRQ,
XXBRW,
XXEXTRACTUW,
XXINSERTW,
XXMRGHW,
XXMRGLW,
XXPERM,
XXPERMR,
XXSLDWI,
XXSLDWIs,
XXSPLTIB,
XXSPLTW,
XXSPLTWs,
XXPERMDI,
XXPERMDIs,
VADDCUQ,
VADDECUQ,
VADDEUQM,
VADDUQM,
VMUL10CUQ,
VMUL10ECUQ,
VMUL10EUQ,
VMUL10UQ,
VSUBCUQ,
VSUBECUQ,
VSUBEUQM,
VSUBUQM,
XSCMPEXPQP,
XSCMPOQP,
XSCMPUQP,
XSTSTDCQP,
XSXSIGQP,
BCDCFN_rec,
BCDCFZ_rec,
BCDCPSGN_rec,
BCDCTN_rec,
BCDCTZ_rec,
BCDSETSGN_rec,
BCDS_rec,
BCDTRUNC_rec,
BCDUS_rec,
BCDUTRUNC_rec
)>;
// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and one
// dispatch.
def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
BCDSR_rec,
XSADDQP,
XSADDQPO,
XSCVDPQP,
XSCVQPDP,
XSCVQPDPO,
XSCVQPSDZ,
XSCVQPSWZ,
XSCVQPUDZ,
XSCVQPUWZ,
XSCVSDQP,
XSCVUDQP,
XSRQPI,
XSRQPIX,
XSRQPXP,
XSSUBQP,
XSSUBQPO
)>;
// 23 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and one
// dispatch.
def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
BCDCTSQ_rec
)>;
// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and one
// dispatch.
def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
XSMADDQP,
XSMADDQPO,
XSMSUBQP,
XSMSUBQPO,
XSMULQP,
XSMULQPO,
XSNMADDQP,
XSNMADDQPO,
XSNMSUBQP,
XSNMSUBQPO
)>;
// 37 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and one
// dispatch.
def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
BCDCFSQ_rec
)>;
// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and one
// dispatch.
def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
XSDIVQP,
XSDIVQPO
)>;
// 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and one
// dispatch.
def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
XSSQRTQP,
XSSQRTQPO
)>;
// 6 Cycle Load uses a single slice.
def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "LXVL(L)?")
)>;
// 5 Cycle Load uses a single slice.
def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "LVE(B|H|W)X$"),
(instregex "LVX(L)?"),
(instregex "LXSI(B|H)ZX$"),
LXSDX,
LXVB16X,
LXVD2X,
LXVWSX,
LXSIWZX,
LXV,
LXVX,
LXSD,
DFLOADf64,
XFLOADf64,
LIWZX
)>;
// 4 Cycle Load uses a single slice.
def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "DCB(F|T|ST)(EP)?$"),
(instregex "DCBZ(L)?(EP)?$"),
(instregex "DCBTST(EP)?$"),
(instregex "CP_COPY(8)?$"),
(instregex "CP_PASTE(8)?$"),
(instregex "ICBI(EP)?$"),
(instregex "ICBT(LS)?$"),
(instregex "LBARX(L)?$"),
(instregex "LBZ(CIX|8|X|X8|XTLS|XTLS_32)?(_)?$"),
(instregex "LD(ARX|ARXL|BRX|CIX|X|XTLS)?(_)?$"),
(instregex "LH(A|B)RX(L)?(8)?$"),
(instregex "LHZ(8|CIX|X|X8|XTLS|XTLS_32)?(_)?$"),
(instregex "LWARX(L)?$"),
(instregex "LWBRX(8)?$"),
(instregex "LWZ(8|CIX|X|X8|XTLS|XTLS_32)?(_)?$"),
CP_ABORT,
DARN,
EnforceIEIO,
ISYNC,
MSGSYNC,
TLBSYNC,
SYNC,
LMW,
LSWI
)>;
// 4 Cycle Restricted load uses a single slice but all three dispatches for the
// whole superslice.
def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_3SLOTS_1C],
(instrs
LFIWZX,
LFDX,
LFD
)>;
// Cracked Load Instructions.
// Load instructions that can be done in parallel.
def : InstRW<[P9_LS_4C, P9_LS_4C, IP_AGEN_1C, IP_AGEN_1C,
DISP_PAIR_1C],
(instrs
SLBIA,
SLBIE,
SLBMFEE,
SLBMFEV,
SLBMTE,
TLBIEL
)>;
// Cracked Load Instruction.
// Requires Load and ALU pieces totaling 6 cycles. The Load and ALU
// operations can be run in parallel.
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C,
DISP_PAIR_1C, DISP_PAIR_1C],
(instrs
(instregex "L(W|H)ZU(X)?(8)?$")
)>;
// Cracked TEND Instruction.
// Requires Load and ALU pieces totaling 6 cycles. The Load and ALU
// operations can be run in parallel.
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C],
(instrs
TEND
)>;
// Cracked Store Instruction
// Consecutive Store and ALU instructions. The store is restricted and requires
// three dispatches.
def : InstRW<[P9_StoreAndALUOp_3C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "ST(B|H|W|D)CX$")
)>;
// Cracked Load Instruction.
// Two consecutive load operations for a total of 8 cycles.
def : InstRW<[P9_LoadAndLoadOp_8C, IP_AGEN_1C, IP_AGEN_1C,
DISP_1C, DISP_1C],
(instrs
LDMX
)>;
// Cracked Load instruction.
// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
// operations cannot be done at the same time and so their latencies are added.
def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C],
(instrs
(instregex "LHA(X)?(8)?$"),
(instregex "CP_PASTE(8)?_rec$"),
(instregex "LWA(X)?(_32)?$"),
TCHECK
)>;
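// For illustration of the cracked-instruction convention: sequential pieces
// use a combined class whose latency is the sum (P9_LoadAndALUOp_6C above is
// the 4 cycle load plus the 2 cycle ALU, 4 + 2 = 6), while parallel pieces
// are listed as two independent resources such as [P9_LS_4C, P9_ALU_2C] and
// each result keeps its own latency.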
// Cracked Restricted Load instruction.
// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
// operations cannot be done at the same time and so their latencies are added.
// Full 6 dispatches are required as this is both cracked and restricted.
def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
LFIWAX
)>;
// Cracked Load instruction.
// Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU
// operations cannot be done at the same time and so their latencies are added.
// Two dispatches are required as this is a cracked instruction.
def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
LXSIWAX,
LIWAX
)>;
// Cracked Load instruction.
// Requires consecutive Load (4 cycles) and ALU (3 cycles) pieces totaling 7
// cycles. The Load and ALU operations cannot be done at the same time and so
// their latencies are added.
// Full 6 dispatches are required as this is a restricted instruction.
def : InstRW<[P9_LoadAndALU2Op_7C, IP_AGEN_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
LFSX,
LFS
)>;
// Cracked Load instruction.
// Requires consecutive Load and ALU pieces totaling 8 cycles. The Load and ALU
// operations cannot be done at the same time and so their latencies are added.
// Two dispatches are required as this is a cracked instruction.
def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
LXSSP,
LXSSPX,
XFLOADf32,
DFLOADf32
)>;
// Cracked 3-Way Load Instruction
// Load with two ALU operations that depend on each other
def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
DISP_PAIR_1C, DISP_PAIR_1C, DISP_1C],
(instrs
(instregex "LHAU(X)?(8)?$"),
LWAUX
)>;
// Cracked Load that requires the PM resource.
// Since the Load and the PM cannot be done at the same time the latencies are
// added. Requires 8 cycles. Since the PM requires the full superslice we need
// both EXECE, EXECO pipelines as well as 1 dispatch for the PM. The Load
// requires the remaining 1 dispatch.
def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C, DISP_1C],
(instrs
LXVH8X,
LXVDSX,
LXVW4X
)>;
// Single slice Restricted store operation. The restricted operation requires
// all three dispatches for the superslice.
def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_3SLOTS_1C],
(instrs
(instregex "STF(S|D|IWX|SX|DX)$"),
(instregex "STXS(D|DX|SPX|IWX|IBX|IHX|SP)(v)?$"),
(instregex "STW(8)?$"),
(instregex "(D|X)FSTORE(f32|f64)$"),
(instregex "ST(W|H|D)BRX$"),
(instregex "ST(B|H|D)(8)?$"),
(instregex "ST(B|W|H|D)(CI)?X(TLS|TLS_32)?(8)?(_)?$"),
STIWX,
SLBIEG,
STMW,
STSWI,
TLBIE
)>;
// Vector Store Instruction
// Requires the whole superslice and therefore requires one dispatch
// as well as both the Even and Odd exec pipelines.
def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "STVE(B|H|W)X$"),
(instregex "STVX(L)?$"),
(instregex "STXV(B16X|H8X|W4X|D2X|L|LL|X)?$")
)>;
// 5 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C],
(instrs
(instregex "MTCTR(8)?(loop)?$"),
(instregex "MTLR(8)?$")
)>;
// 12 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C],
(instrs
(instregex "M(T|F)VRSAVE(v)?$"),
(instregex "M(T|F)PMR$"),
(instregex "M(T|F)TB(8)?$"),
(instregex "MF(SPR|CTR|LR)(8)?$"),
(instregex "M(T|F)MSR(D)?$"),
(instregex "MTSPR(8)?$")
)>;
// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C],
(instrs
DIVW,
DIVWO,
DIVWU,
DIVWUO,
MODSW
)>;
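// Naming note (an inference from the P9 scheduling definitions, not verified
// here): the trailing _8 in P9_DIV_16C_8 and P9_DIV_24C_8 encodes the
// ResourceCycles of the shared DIV unit, i.e. the unit stays busy for 8
// cycles per operation even though the result latencies differ.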
// 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C],
(instrs
DIVWE,
DIVWEO,
DIVD,
DIVDO,
DIVWEU,
DIVWEUO,
DIVDU,
DIVDUO,
MODSD,
MODUD,
MODUW
)>;
// 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C],
(instrs
DIVDE,
DIVDEO,
DIVDEU,
DIVDEUO
)>;
// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
// and one full superslice for the DIV operation since there is only one DIV per
// superslice. Latency of DIV plus ALU is 18.
def : InstRW<[P9_IntDivAndALUOp_18C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
DISP_EVEN_1C, DISP_1C],
(instrs
(instregex "DIVW(U)?(O)?_rec$")
)>;
// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
// and one full superslice for the DIV operation since there is only one DIV per
// superslice. Latency of DIV plus ALU is 26.
def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
DISP_EVEN_1C, DISP_1C],
(instrs
DIVD_rec,
DIVDO_rec,
DIVDU_rec,
DIVDUO_rec,
DIVWE_rec,
DIVWEO_rec,
DIVWEU_rec,
DIVWEUO_rec
)>;
// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
// and one full superslice for the DIV operation since there is only one DIV per
// superslice. Latency of DIV plus ALU is 42.
def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
DISP_EVEN_1C, DISP_1C],
(instrs
DIVDE_rec,
DIVDEO_rec,
DIVDEU_rec,
DIVDEUO_rec
)>;
// CR access instructions in _BrMCR, IIC_BrMCRX.
// Cracked, restricted, ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
// latencies are not added together. Otherwise this is like having two
// instructions running together on two pipelines and 6 dispatches. ALU ops are
// 2 cycles each.
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
MTCRF,
MTCRF8
)>;
// Cracked ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
// latencies are not added together. Otherwise this is like having two
// instructions running together on two pipelines and 2 dispatches. ALU ops are
// 2 cycles each.
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C],
(instrs
(instregex "ADDC(8)?(O)?_rec$"),
(instregex "SUBFC(8)?(O)?_rec$")
)>;
// Cracked ALU operations.
// Two ALU ops can be done in parallel.
// One is a three cycle ALU, the other is a two cycle ALU.
// One of the ALU ops is restricted, the other is not, so we have a total of
// 4 dispatches.
def : InstRW<[P9_ALU_2C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "F(N)?ABS(D|S)_rec$"),
(instregex "FCPSGN(D|S)_rec$"),
(instregex "FNEG(D|S)_rec$"),
FMR_rec
)>;
// Cracked ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
// latencies are not added together. Otherwise this is like having two
// instructions running together on two pipelines and 2 dispatches.
// ALU ops are 3 cycles each.
def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C],
(instrs
MCRFS
)>;
// Cracked Restricted ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
// latencies are not added together. Otherwise this is like having two
// instructions running together on two pipelines and 6 dispatches.
// ALU ops are 3 cycles each.
def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MTFSF(b|_rec)?$"),
(instregex "MTFSFI(_rec)?$")
)>;
// Cracked instruction made of two ALU ops.
// The two ops cannot be done in parallel.
// One of the ALU ops is restricted and takes 3 dispatches.
def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "RLD(I)?C(R|L)_rec$"),
(instregex "RLW(IMI|INM|NM)(8)?_rec$"),
(instregex "SLW(8)?_rec$"),
(instregex "SRAW(I)?_rec$"),
(instregex "SRW(8)?_rec$"),
RLDICL_32_rec,
RLDIMI_rec
)>;
// Cracked instruction made of two ALU ops.
// The two ops cannot be done in parallel.
// Both of the ALU ops are restricted and take 3 dispatches.
def : InstRW<[P9_ALU2OpAndALU2Op_6C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MFFS(L|CE|_rec)?$")
)>;
// Cracked ALU instruction composed of three consecutive 2 cycle ALU ops for a
// total of 6 cycles. All of the ALU operations are also restricted so each
// takes 3 dispatches for a total of 9.
def : InstRW<[P9_ALUOpAndALUOpAndALUOp_6C, IP_EXEC_1C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MFCR(8)?$")
)>;
// Cracked instruction made of two ALU ops.
// The two ops cannot be done in parallel.
def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
(instregex "EXTSWSLI_32_64_rec$"),
(instregex "SRAD(I)?_rec$"),
EXTSWSLI_rec,
SLD_rec,
SRD_rec,
RLDIC_rec
)>;
// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FDIV
)>;
// 33 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_36C_8, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
FDIV_rec
)>;
// 36 Cycle DP Instruction.
// Instruction can be done on a single slice.
def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C],
(instrs
XSSQRTDP
)>;
// 36 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FSQRT
)>;
// 36 Cycle DP Vector Instruction.
def : InstRW<[P9_DPE_36C_10, P9_DPO_36C_10, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C],
(instrs
XVSQRTDP
)>;
// 27 Cycle DP Vector Instruction.
def : InstRW<[P9_DPE_27C_10, P9_DPO_27C_10, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C],
(instrs
XVSQRTSP
)>;
// 36 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_39C_10, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
FSQRT_rec
)>;
// 26 Cycle DP Instruction.
def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C],
(instrs
XSSQRTSP
)>;
// 26 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FSQRTS
)>;
// 26 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_29C_5, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
FSQRTS_rec
)>;
// 33 Cycle DP Instruction. Takes one slice and 1 dispatch.
def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C],
(instrs
XSDIVDP
)>;
// 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FDIVS
)>;
// 22 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_25C_5, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
FDIVS_rec
)>;
// 22 Cycle DP Instruction. Takes one slice and 1 dispatch.
def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C],
(instrs
XSDIVSP
)>;
// 24 Cycle DP Vector Instruction. Takes one full superslice.
// Includes both EXECE, EXECO pipelines and 1 dispatch for the given
// superslice.
def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C],
(instrs
XVDIVSP
)>;
// 33 Cycle DP Vector Instruction. Takes one full superslice.
// Includes both EXECE, EXECO pipelines and 1 dispatch for the given
// superslice.
def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C,
DISP_1C],
(instrs
XVDIVDP
)>;
// Instruction cracked into three pieces. One Load and two ALU operations.
// The Load and one of the ALU ops cannot be run at the same time and so the
// latencies are added together for 6 cycles. The remaining ALU is 2 cycles.
// Both the load and the ALU that depends on it are restricted and so they take
// a total of 6 dispatches. The final dispatch comes from the second ALU op.
// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load.
def : InstRW<[P9_LoadAndALU2Op_7C, P9_ALU_2C,
IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "LF(SU|SUX)$")
)>;
// Cracked instruction made up of a Store and an ALU. The ALU does not depend on
// the store and so it can be run at the same time as the store. The store is
// also restricted.
def : InstRW<[P9_LS_1C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "STF(S|D)U(X)?$"),
(instregex "ST(B|H|W|D)U(X)?(8)?$")
)>;
// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
// the load and so it can be run at the same time as the load.
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
DISP_PAIR_1C, DISP_PAIR_1C],
(instrs
(instregex "LBZU(X)?(8)?$"),
(instregex "LDU(X)?$")
)>;
// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
// the load and so it can be run at the same time as the load. The load is also
// restricted. 3 dispatches are from the restricted load while the other one
// is from the ALU. The AGEN pipeline is from the load and the EXEC pipeline
// is required for the ALU.
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "LF(DU|DUX)$")
)>;
// Crypto Instructions
// 6 Cycle CY operation. Only one CY unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and one
// dispatch.
def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C],
(instrs
(instregex "VPMSUM(B|H|W|D)$"),
(instregex "V(N)?CIPHER(LAST)?$"),
VSBOX
)>;
// Branch Instructions
// Two Cycle Branch
def : InstRW<[P9_BR_2C, DISP_BR_1C],
(instrs
(instregex "BCCCTR(L)?(8)?$"),
(instregex "BCCL(A|R|RL)?$"),
(instregex "BCCTR(L)?(8)?(n)?$"),
(instregex "BD(N)?Z(8|A|Am|Ap|m|p)?$"),
(instregex "BD(N)?ZL(A|Am|Ap|R|R8|RL|RLm|RLp|Rm|Rp|m|p)?$"),
(instregex "BL(_TLS|_NOP)?$"),
(instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?$"),
(instregex "BLA(8|8_NOP)?$"),
(instregex "BLR(8|L)?$"),
(instregex "TAILB(A)?(8)?$"),
(instregex "TAILBCTR(8)?$"),
(instregex "gBC(A|Aat|CTR|CTRL|L|LA|LAat|LR|LRL|Lat|at)?$"),
(instregex "BCLR(L)?(n)?$"),
(instregex "BCTR(L)?(8)?$"),
B,
BA,
BC,
BCC,
BCCA,
BCL,
BCLalways,
BCLn,
BCTRL8_LDinto_toc,
BCTRL_LWZinto_toc,
BCn,
CTRL_DEP
)>;
// Five Cycle Branch with a 2 Cycle ALU Op
// Operations must be done consecutively and not in parallel.
def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C, DISP_BR_1C, DISP_1C],
(instrs
ADDPCIS
)>;
// Special Extracted Instructions For Atomics
// Atomic Load
def : InstRW<[P9_LS_1C, P9_LS_1C, P9_LS_4C, P9_LS_4C, P9_LS_4C,
IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C, IP_AGEN_1C,
IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_3SLOTS_1C,
DISP_3SLOTS_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
(instregex "L(D|W)AT$")
)>;
// Atomic Store
def : InstRW<[P9_LS_1C, P9_LS_4C, P9_LS_4C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C,
IP_AGEN_1C, DISP_1C, DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "ST(D|W)AT$")
)>;
// Signal Processing Engine (SPE) Instructions
// These instructions are not supported on Power 9
def : InstRW<[],
(instrs
BRINC,
EVABS,
EVEQV,
EVMRA,
EVNAND,
EVNEG,
(instregex "EVADD(I)?W$"),
(instregex "EVADD(SM|SS|UM|US)IAAW$"),
(instregex "EVAND(C)?$"),
(instregex "EVCMP(EQ|GTS|GTU|LTS|LTU)$"),
(instregex "EVCNTL(S|Z)W$"),
(instregex "EVDIVW(S|U)$"),
(instregex "EVEXTS(B|H)$"),
(instregex "EVLD(H|W|D)(X)?$"),
(instregex "EVLHH(E|OS|OU)SPLAT(X)?$"),
(instregex "EVLWHE(X)?$"),
(instregex "EVLWHO(S|U)(X)?$"),
(instregex "EVLW(H|W)SPLAT(X)?$"),
(instregex "EVMERGE(HI|LO|HILO|LOHI)$"),
(instregex "EVMHEG(S|U)M(F|I)A(A|N)$"),
(instregex "EVMHES(M|S)(F|I)(A|AA|AAW|ANW)?$"),
(instregex "EVMHEU(M|S)I(A|AA|AAW|ANW)?$"),
(instregex "EVMHOG(U|S)M(F|I)A(A|N)$"),
(instregex "EVMHOS(M|S)(F|I)(A|AA|AAW|ANW)?$"),
(instregex "EVMHOU(M|S)I(A|AA|ANW|AAW)?$"),
(instregex "EVMWHS(M|S)(F|FA|I|IA)$"),
(instregex "EVMWHUMI(A)?$"),
(instregex "EVMWLS(M|S)IA(A|N)W$"),
(instregex "EVMWLU(M|S)I(A|AA|AAW|ANW)?$"),
(instregex "EVMWSM(F|I)(A|AA|AN)?$"),
(instregex "EVMWSSF(A|AA|AN)?$"),
(instregex "EVMWUMI(A|AA|AN)?$"),
(instregex "EV(N|X)?OR(C)?$"),
(instregex "EVR(LW|LWI|NDW)$"),
(instregex "EVSLW(I)?$"),
(instregex "EVSPLAT(F)?I$"),
(instregex "EVSRW(I)?(S|U)$"),
(instregex "EVST(DD|DH|DW|WHE|WHO|WWE|WWO)(X)?$"),
(instregex "EVSUBF(S|U)(M|S)IAAW$"),
(instregex "EVSUB(I)?FW$")
)> { let Unsupported = 1; }
// General Instructions without scheduling support.
def : InstRW<[],
(instrs
(instregex "(H)?RFI(D)?$"),
(instregex "DSS(ALL)?$"),
(instregex "DST(ST)?(T)?(64)?$"),
(instregex "ICBL(C|Q)$"),
(instregex "L(W|H|B)EPX$"),
(instregex "ST(W|H|B)EPX$"),
(instregex "(L|ST)FDEPX$"),
(instregex "M(T|F)SR(IN)?$"),
(instregex "M(T|F)DCR$"),
(instregex "NOP_GT_PWR(6|7)$"),
(instregex "TLB(IA|IVAX|SX|SX2|SX2D|LD|LI|RE|RE2|WE|WE2)$"),
(instregex "WRTEE(I)?$"),
ATTN,
CLRBHRB,
MFBHRBE,
MBAR,
MSYNC,
SLBSYNC,
SLBFEE_rec,
NAP,
STOP,
TRAP,
RFCI,
RFDI,
RFMCI,
SC,
DCBA,
DCBI,
DCCCI,
ICCCI
)> { let Unsupported = 1; }
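// The empty resource list combined with "let Unsupported = 1" marks these
// opcodes as unsupported on the P9 model, which presumably exempts them from
// the CompleteModel verification rather than assigning them resources (an
// inference from the pattern above, not verified against TargetSchedule.td).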
Index: head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td (revision 362609)
@@ -1,561 +1,565 @@
//===-- PPC.td - Describe the PowerPC Target Machine -------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the top level entry point for the PowerPC target.
//
//===----------------------------------------------------------------------===//
// Get the target-independent interfaces which we are implementing.
//
include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// PowerPC Subtarget features.
//
//===----------------------------------------------------------------------===//
// CPU Directives //
//===----------------------------------------------------------------------===//
def Directive440 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_440", "">;
def Directive601 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_601", "">;
def Directive602 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_602", "">;
def Directive603 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_603", "">;
def Directive604 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_603", "">;
def Directive620 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_603", "">;
def Directive7400: SubtargetFeature<"", "CPUDirective", "PPC::DIR_7400", "">;
def Directive750 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_750", "">;
def Directive970 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_970", "">;
def Directive32 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_32", "">;
def Directive64 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_64", "">;
def DirectiveA2 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_A2", "">;
def DirectiveE500 : SubtargetFeature<"", "CPUDirective",
"PPC::DIR_E500", "">;
def DirectiveE500mc : SubtargetFeature<"", "CPUDirective",
"PPC::DIR_E500mc", "">;
def DirectiveE5500 : SubtargetFeature<"", "CPUDirective",
"PPC::DIR_E5500", "">;
def DirectivePwr3: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR3", "">;
def DirectivePwr4: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR4", "">;
def DirectivePwr5: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR5", "">;
def DirectivePwr5x
: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR5X", "">;
def DirectivePwr6: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR6", "">;
def DirectivePwr6x
: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR6X", "">;
def DirectivePwr7: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR7", "">;
def DirectivePwr8: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR8", "">;
def DirectivePwr9: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR9", "">;
def DirectivePwrFuture
: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR_FUTURE", "">;
def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true",
"Enable 64-bit instructions">;
def FeatureHardFloat : SubtargetFeature<"hard-float", "HasHardFloat", "true",
"Enable floating-point instructions">;
def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
"Enable 64-bit registers usage for ppc32 [beta]">;
def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true",
"Use condition-register bits individually">;
def FeatureFPU : SubtargetFeature<"fpu","HasFPU","true",
"Enable classic FPU instructions",
[FeatureHardFloat]>;
def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true",
"Enable Altivec instructions",
[FeatureFPU]>;
def FeatureSPE : SubtargetFeature<"spe","HasSPE", "true",
"Enable SPE instructions",
[FeatureHardFloat]>;
def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
"Enable the MFOCRF instruction">;
def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true",
"Enable the fsqrt instruction",
[FeatureFPU]>;
def FeatureFCPSGN : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true",
"Enable the fcpsgn instruction",
[FeatureFPU]>;
def FeatureFRE : SubtargetFeature<"fre", "HasFRE", "true",
"Enable the fre instruction",
[FeatureFPU]>;
def FeatureFRES : SubtargetFeature<"fres", "HasFRES", "true",
"Enable the fres instruction",
[FeatureFPU]>;
def FeatureFRSQRTE : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
"Enable the frsqrte instruction",
[FeatureFPU]>;
def FeatureFRSQRTES : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
"Enable the frsqrtes instruction",
[FeatureFPU]>;
def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
"Assume higher precision reciprocal estimates">;
def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
"Enable the stfiwx instruction",
[FeatureFPU]>;
def FeatureLFIWAX : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
"Enable the lfiwax instruction",
[FeatureFPU]>;
def FeatureFPRND : SubtargetFeature<"fprnd", "HasFPRND", "true",
"Enable the fri[mnpz] instructions",
[FeatureFPU]>;
def FeatureFPCVT : SubtargetFeature<"fpcvt", "HasFPCVT", "true",
"Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions",
[FeatureFPU]>;
def FeatureISEL : SubtargetFeature<"isel","HasISEL", "true",
"Enable the isel instruction">;
def FeatureBPERMD : SubtargetFeature<"bpermd", "HasBPERMD", "true",
"Enable the bpermd instruction">;
def FeatureExtDiv : SubtargetFeature<"extdiv", "HasExtDiv", "true",
"Enable extended divide instructions">;
def FeatureLDBRX : SubtargetFeature<"ldbrx","HasLDBRX", "true",
"Enable the ldbrx instruction">;
def FeatureCMPB : SubtargetFeature<"cmpb", "HasCMPB", "true",
"Enable the cmpb instruction">;
def FeatureICBT : SubtargetFeature<"icbt","HasICBT", "true",
"Enable icbt instruction">;
def FeatureBookE : SubtargetFeature<"booke", "IsBookE", "true",
"Enable Book E instructions",
[FeatureICBT]>;
def FeatureMSYNC : SubtargetFeature<"msync", "HasOnlyMSYNC", "true",
"Has only the msync instruction instead of sync",
[FeatureBookE]>;
def FeatureE500 : SubtargetFeature<"e500", "IsE500", "true",
"Enable E500/E500mc instructions">;
def FeatureSecurePlt : SubtargetFeature<"secure-plt","SecurePlt", "true",
"Enable secure plt mode">;
def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true",
"Enable PPC 4xx instructions">;
def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true",
"Enable PPC 6xx instructions">;
def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true",
"Enable QPX instructions",
[FeatureFPU]>;
def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true",
"Enable VSX instructions",
[FeatureAltivec]>;
def FeatureTwoConstNR :
SubtargetFeature<"two-const-nr", "NeedsTwoConstNR", "true",
"Requires two constant Newton-Raphson computation">;
def FeatureP8Altivec : SubtargetFeature<"power8-altivec", "HasP8Altivec", "true",
"Enable POWER8 Altivec instructions",
[FeatureAltivec]>;
def FeatureP8Crypto : SubtargetFeature<"crypto", "HasP8Crypto", "true",
"Enable POWER8 Crypto instructions",
[FeatureP8Altivec]>;
def FeatureP8Vector : SubtargetFeature<"power8-vector", "HasP8Vector", "true",
"Enable POWER8 vector instructions",
[FeatureVSX, FeatureP8Altivec]>;
def FeatureDirectMove :
SubtargetFeature<"direct-move", "HasDirectMove", "true",
"Enable Power8 direct move instructions",
[FeatureVSX]>;
def FeaturePartwordAtomic : SubtargetFeature<"partword-atomics",
"HasPartwordAtomics", "true",
"Enable l[bh]arx and st[bh]cx.">;
def FeatureInvariantFunctionDescriptors :
SubtargetFeature<"invariant-function-descriptors",
"HasInvariantFunctionDescriptors", "true",
"Assume function descriptors are invariant">;
def FeatureLongCall : SubtargetFeature<"longcall", "UseLongCalls", "true",
"Always use indirect calls">;
def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true",
"Enable Hardware Transactional Memory instructions">;
def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true",
"Implement mftb using the mfspr instruction">;
+def FeatureUnalignedFloats :
+ SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess",
+ "true", "CPU does not trap on unaligned FP access">;
def FeaturePPCPreRASched:
SubtargetFeature<"ppc-prera-sched", "UsePPCPreRASchedStrategy", "true",
"Use PowerPC pre-RA scheduling strategy">;
def FeaturePPCPostRASched:
SubtargetFeature<"ppc-postra-sched", "UsePPCPostRASchedStrategy", "true",
"Use PowerPC post-RA scheduling strategy">;
def FeatureFloat128 :
SubtargetFeature<"float128", "HasFloat128", "true",
"Enable the __float128 data type for IEEE-754R Binary128.",
[FeatureVSX]>;
def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD",
"POPCNTD_Fast",
"Enable the popcnt[dw] instructions">;
// Note that for the a2/a2q processor models we should not use popcnt[dw] by
// default. These processors do support the instructions, but they're
// microcoded, and the software emulation is about twice as fast.
def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD",
"POPCNTD_Slow",
"Has slow popcnt[dw] instructions">;
def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true",
"Treat vector data stream cache control instructions as deprecated">;
def FeatureISA3_0 : SubtargetFeature<"isa-v30-instructions", "IsISA3_0",
"true",
"Enable instructions added in ISA 3.0.">;
def FeatureP9Altivec : SubtargetFeature<"power9-altivec", "HasP9Altivec", "true",
"Enable POWER9 Altivec instructions",
[FeatureISA3_0, FeatureP8Altivec]>;
def FeatureP9Vector : SubtargetFeature<"power9-vector", "HasP9Vector", "true",
"Enable POWER9 vector instructions",
[FeatureISA3_0, FeatureP8Vector,
FeatureP9Altivec]>;
// A separate feature for this even though it is equivalent to P9Vector
// because this is a feature of the implementation rather than the architecture
// and may go away with future CPUs.
def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units",
"VectorsUseTwoUnits",
"true",
"Vectors use two units">;
// Since new processors generally contain a superset of features of those that
// came before them, the idea is to make implementations of new processors
// less error prone and easier to read.
// Namely:
// list<SubtargetFeature> P8InheritableFeatures = ...
// list<SubtargetFeature> FutureProcessorAdditionalFeatures =
// [ features that Power8 does not support but inheritable ]
// list<SubtargetFeature> FutureProcessorSpecificFeatures =
// [ features that Power8 does not support and not inheritable ]
// list<SubtargetFeature> FutureProcessorInheritableFeatures =
// !listconcat(P8InheritableFeatures, FutureProcessorAdditionalFeatures)
// list<SubtargetFeature> FutureProcessorFeatures =
// !listconcat(FutureProcessorInheritableFeatures,
// FutureProcessorSpecificFeatures)
// Makes it explicit and obvious what is new in FutureProcessor vs. Power8 as
// well as providing a single point of definition if the feature set will be
// used elsewhere.
def ProcessorFeatures {
// Power7
list<SubtargetFeature> P7InheritableFeatures = [DirectivePwr7,
FeatureAltivec,
FeatureVSX,
FeatureMFOCRF,
FeatureFCPSGN,
FeatureFSqrt,
FeatureFRE,
FeatureFRES,
FeatureFRSQRTE,
FeatureFRSQRTES,
FeatureRecipPrec,
FeatureSTFIWX,
FeatureLFIWAX,
FeatureFPRND,
FeatureFPCVT,
FeatureISEL,
FeaturePOPCNTD,
FeatureCMPB,
FeatureLDBRX,
Feature64Bit,
/* Feature64BitRegs, */
FeatureBPERMD,
FeatureExtDiv,
FeatureMFTB,
DeprecatedDST,
- FeatureTwoConstNR];
+ FeatureTwoConstNR,
+ FeatureUnalignedFloats];
list<SubtargetFeature> P7SpecificFeatures = [];
list<SubtargetFeature> P7Features =
!listconcat(P7InheritableFeatures, P7SpecificFeatures);
// Power8
list<SubtargetFeature> P8AdditionalFeatures = [DirectivePwr8,
FeatureP8Altivec,
FeatureP8Vector,
FeatureP8Crypto,
FeatureHTM,
FeatureDirectMove,
FeatureICBT,
FeaturePartwordAtomic];
list<SubtargetFeature> P8SpecificFeatures = [];
list<SubtargetFeature> P8InheritableFeatures =
!listconcat(P7InheritableFeatures, P8AdditionalFeatures);
list<SubtargetFeature> P8Features =
!listconcat(P8InheritableFeatures, P8SpecificFeatures);
// Power9
list<SubtargetFeature> P9AdditionalFeatures = [DirectivePwr9,
FeatureP9Altivec,
FeatureP9Vector,
FeatureISA3_0];
// Some features are unique to Power9 and there is no reason to assume
// they will be part of any future CPUs. One example is the narrower
// dispatch for vector operations than scalar ones. For the time being,
// this list also includes scheduling-related features since we do not have
// enough info to create custom scheduling strategies for future CPUs.
list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits,
FeaturePPCPreRASched,
FeaturePPCPostRASched];
list<SubtargetFeature> P9InheritableFeatures =
!listconcat(P8InheritableFeatures, P9AdditionalFeatures);
list<SubtargetFeature> P9Features =
!listconcat(P9InheritableFeatures, P9SpecificFeatures);
// Future
// For a future CPU we assume that all of the existing Power 9 features still
// exist, with the exception of those we know are Power 9 specific.
list<SubtargetFeature> FutureAdditionalFeatures = [];
list<SubtargetFeature> FutureSpecificFeatures = [];
list<SubtargetFeature> FutureInheritableFeatures =
!listconcat(P9InheritableFeatures, FutureAdditionalFeatures);
list<SubtargetFeature> FutureFeatures =
!listconcat(FutureInheritableFeatures, FutureSpecificFeatures);
}
// Note: Future features to add when support is extended to more
// recent ISA levels:
//
// DFP p6, p6x, p7 decimal floating-point instructions
// POPCNTB p5 through p7 popcntb and related instructions
//===----------------------------------------------------------------------===//
// Classes used for relation maps.
//===----------------------------------------------------------------------===//
// RecFormRel - Filter class used to relate non-record-form instructions with
// their record-form variants.
class RecFormRel;
// AltVSXFMARel - Filter class used to relate the primary addend-killing VSX
// FMA instruction forms with their corresponding factor-killing forms.
class AltVSXFMARel {
bit IsVSXFMAAlt = 0;
}
//===----------------------------------------------------------------------===//
// Relation Map Definitions.
//===----------------------------------------------------------------------===//
def getRecordFormOpcode : InstrMapping {
let FilterClass = "RecFormRel";
// Instructions with the same BaseName and Interpretation64Bit values
// form a row.
let RowFields = ["BaseName", "Interpretation64Bit"];
// Instructions with the same RC value form a column.
let ColFields = ["RC"];
// The key column are the non-record-form instructions.
let KeyCol = ["0"];
// Value columns RC=1
let ValueCols = [["1"]];
}
def getNonRecordFormOpcode : InstrMapping {
let FilterClass = "RecFormRel";
// Instructions with the same BaseName and Interpretation64Bit values
// form a row.
let RowFields = ["BaseName", "Interpretation64Bit"];
// Instructions with the same RC value form a column.
let ColFields = ["RC"];
// The key column are the record-form instructions.
let KeyCol = ["1"];
// Value columns are RC=0
let ValueCols = [["0"]];
}
def getAltVSXFMAOpcode : InstrMapping {
let FilterClass = "AltVSXFMARel";
// Instructions with the same BaseName value form a row.
let RowFields = ["BaseName"];
// Instructions with the same IsVSXFMAAlt value form a column.
let ColFields = ["IsVSXFMAAlt"];
// The key column are the (default) addend-killing instructions.
let KeyCol = ["0"];
// Value columns IsVSXFMAAlt=1
let ValueCols = [["1"]];
}
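// For illustration, TableGen turns each InstrMapping above into a generated
// lookup routine that backend C++ code can call. Switching an instruction to
// its record form looks roughly like this (a sketch; the exact generated
// signature is an assumption):
//   int NewOpc = PPC::getRecordFormOpcode(MI.getOpcode());
//   if (NewOpc != -1)
//     MI.setDesc(TII->get(NewOpc));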
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
include "PPCRegisterInfo.td"
include "PPCSchedule.td"
//===----------------------------------------------------------------------===//
// PowerPC processors supported.
//
def : Processor<"generic", G3Itineraries, [Directive32, FeatureHardFloat,
FeatureMFTB]>;
def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL,
FeatureFRES, FeatureFRSQRTE,
FeatureICBT, FeatureBookE,
FeatureMSYNC, FeatureMFTB]>;
def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL,
FeatureFRES, FeatureFRSQRTE,
FeatureICBT, FeatureBookE,
FeatureMSYNC, FeatureMFTB]>;
def : Processor<"601", G3Itineraries, [Directive601, FeatureFPU]>;
def : Processor<"602", G3Itineraries, [Directive602, FeatureFPU,
FeatureMFTB]>;
def : Processor<"603", G3Itineraries, [Directive603,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"603e", G3Itineraries, [Directive603,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"603ev", G3Itineraries, [Directive603,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"604", G3Itineraries, [Directive604,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"604e", G3Itineraries, [Directive604,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"620", G3Itineraries, [Directive620,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"750", G4Itineraries, [Directive750,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"g3", G3Itineraries, [Directive750,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec,
FeatureFRES, FeatureFRSQRTE,
FeatureMFTB]>;
def : ProcessorModel<"970", G5Model,
[Directive970, FeatureAltivec,
FeatureMFOCRF, FeatureFSqrt,
FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX,
Feature64Bit /*, Feature64BitRegs */,
FeatureMFTB]>;
def : ProcessorModel<"g5", G5Model,
[Directive970, FeatureAltivec,
FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
FeatureFRES, FeatureFRSQRTE,
Feature64Bit /*, Feature64BitRegs */,
FeatureMFTB, DeprecatedDST]>;
def : ProcessorModel<"e500", PPCE500Model,
[DirectiveE500,
FeatureICBT, FeatureBookE,
FeatureISEL, FeatureMFTB, FeatureMSYNC, FeatureSPE]>;
def : ProcessorModel<"e500mc", PPCE500mcModel,
[DirectiveE500mc,
FeatureSTFIWX, FeatureICBT, FeatureBookE,
FeatureISEL, FeatureMFTB]>;
def : ProcessorModel<"e5500", PPCE5500Model,
[DirectiveE5500, FeatureMFOCRF, Feature64Bit,
FeatureSTFIWX, FeatureICBT, FeatureBookE,
FeatureISEL, FeatureMFTB]>;
def : ProcessorModel<"a2", PPCA2Model,
[DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
FeatureSTFIWX, FeatureLFIWAX,
FeatureFPRND, FeatureFPCVT, FeatureISEL,
FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX,
Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>;
def : ProcessorModel<"a2q", PPCA2Model,
[DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
FeatureSTFIWX, FeatureLFIWAX,
FeatureFPRND, FeatureFPCVT, FeatureISEL,
FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX,
Feature64Bit /*, Feature64BitRegs */, FeatureQPX,
FeatureMFTB]>;
def : ProcessorModel<"pwr3", G5Model,
[DirectivePwr3, FeatureAltivec,
FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
FeatureSTFIWX, Feature64Bit]>;
def : ProcessorModel<"pwr4", G5Model,
[DirectivePwr4, FeatureAltivec, FeatureMFOCRF,
FeatureFSqrt, FeatureFRES, FeatureFRSQRTE,
FeatureSTFIWX, Feature64Bit, FeatureMFTB]>;
def : ProcessorModel<"pwr5", G5Model,
[DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES,
FeatureSTFIWX, Feature64Bit,
FeatureMFTB, DeprecatedDST]>;
def : ProcessorModel<"pwr5x", G5Model,
[DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES,
FeatureSTFIWX, FeatureFPRND, Feature64Bit,
FeatureMFTB, DeprecatedDST]>;
def : ProcessorModel<"pwr6", G5Model,
[DirectivePwr6, FeatureAltivec,
FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB,
FeatureFPRND, Feature64Bit /*, Feature64BitRegs */,
FeatureMFTB, DeprecatedDST]>;
def : ProcessorModel<"pwr6x", G5Model,
[DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB,
FeatureFPRND, Feature64Bit,
FeatureMFTB, DeprecatedDST]>;
def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>;
def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>;
def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>;
// No scheduler model for future CPU.
def : ProcessorModel<"future", NoSchedModel,
ProcessorFeatures.FutureFeatures>;
def : Processor<"ppc", G3Itineraries, [Directive32, FeatureHardFloat,
FeatureMFTB]>;
def : Processor<"ppc32", G3Itineraries, [Directive32, FeatureHardFloat,
FeatureMFTB]>;
def : ProcessorModel<"ppc64", G5Model,
[Directive64, FeatureAltivec,
FeatureMFOCRF, FeatureFSqrt, FeatureFRES,
FeatureFRSQRTE, FeatureSTFIWX,
Feature64Bit /*, Feature64BitRegs */,
FeatureMFTB]>;
def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.P8Features>;
//===----------------------------------------------------------------------===//
// Calling Conventions
//===----------------------------------------------------------------------===//
include "PPCCallingConv.td"
def PPCInstrInfo : InstrInfo {
let isLittleEndianEncoding = 1;
// FIXME: Unset this when no longer needed!
let decodePositionallyEncodedOperands = 1;
let noNamedPositionallyEncodedOperands = 1;
}
def PPCAsmParser : AsmParser {
let ShouldEmitMatchRegisterName = 0;
}
def PPCAsmParserVariant : AsmParserVariant {
int Variant = 0;
// We do not use hard coded registers in asm strings. However, some
// InstAlias definitions use immediate literals. Set RegisterPrefix
// so that those are not misinterpreted as registers.
string RegisterPrefix = "%";
string BreakCharacters = ".";
}
def PPC : Target {
// Information about the instructions.
let InstructionSet = PPCInstrInfo;
let AssemblyParsers = [PPCAsmParser];
let AssemblyParserVariants = [PPCAsmParserVariant];
let AllowRegisterRenaming = 1;
}
//===----------------------------------------------------------------------===//
// Pfm Counters
//===----------------------------------------------------------------------===//
include "PPCPfmCounters.td"
Index: head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp (revision 362609)
@@ -1,15898 +1,15926 @@
//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the PPCISelLowering class.
//
//===----------------------------------------------------------------------===//
#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCCCState.h"
#include "PPCCallingConv.h"
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCRegisterInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "ppc-lowering"
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
cl::desc("enable quad precision float support on ppc"), cl::Hidden);
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
const PPCSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
// On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
// arguments are at least 4/8 bytes aligned.
bool isPPC64 = Subtarget.isPPC64();
setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4));
// Set up the register classes.
addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
if (!useSoftFloat()) {
if (hasSPE()) {
addRegisterClass(MVT::f32, &PPC::GPRCRegClass);
addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
} else {
addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
}
}
// Match BITREVERSE to customized fast code sequence in the td file.
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
// Sub-word ATOMIC_CMP_SWAP needs to ensure that the input is zero-extended.
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
}
+ if (Subtarget.isISA3_0()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal);
+ setTruncStoreAction(MVT::f64, MVT::f16, Legal);
+ setTruncStoreAction(MVT::f32, MVT::f16, Legal);
+ } else {
+ // No extending loads from f16 or HW conversions back and forth.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ }
+
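+ // Illustrative consequence (an assumed selection, not asserted by this
+ // change): with the Legal actions above, "load half; fpext to double" stays
+ // a single extending load on ISA 3.0 (e.g. lxsihzx feeding xscvhpdp), while
+ // the Expand path on older cores routes FP16_TO_FP through a runtime call.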
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// PowerPC has pre-inc loads and stores.
setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
if (!Subtarget.hasSPE()) {
setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
}
// PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry.
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
setOperationAction(ISD::ADDC, VT, Legal);
setOperationAction(ISD::ADDE, VT, Legal);
setOperationAction(ISD::SUBC, VT, Legal);
setOperationAction(ISD::SUBE, VT, Legal);
}
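// For example, a 64-bit add on a 32-bit target splits into addc on the low
// word (producing the carry) followed by adde on the high word (consuming
// it), which is why these carry-propagating nodes are kept Legal above.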
if (Subtarget.useCRBits()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
if (isPPC64 || Subtarget.hasFPCVT()) {
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
isPPC64 ? MVT::i64 : MVT::i32);
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
isPPC64 ? MVT::i64 : MVT::i32);
} else {
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
}
// PowerPC does not support direct load/store of condition registers.
setOperationAction(ISD::LOAD, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
// FIXME: Remove this once the ANDI glue bug is fixed:
if (ANDIGlueBug)
setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
setTruncStoreAction(VT, MVT::i1, Expand);
}
addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
}
// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
// PPC (the libcall is not available).
setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);
// We do not currently implement these libm ops for PowerPC.
setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
setOperationAction(ISD::FREM, MVT::ppcf128, Expand);
// PowerPC has no SREM/UREM instructions unless we are on P9.
// On P9 we may use a hardware instruction to compute the remainder.
// The instructions are not legalized directly because in the cases where the
// result of both the remainder and the division is required it is more
// efficient to compute the remainder from the result of the division rather
// than use the remainder instruction.
if (Subtarget.isISA3_0()) {
setOperationAction(ISD::SREM, MVT::i32, Custom);
setOperationAction(ISD::UREM, MVT::i32, Custom);
setOperationAction(ISD::SREM, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i64, Custom);
} else {
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
}
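// For example (a sketch of the rationale above): when both "a / b" and
// "a % b" are live, the custom lowering can emit a single divw and rebuild
// the remainder as a - (a / b) * b with mullw and subf, instead of issuing
// an independent modsw alongside the divide.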
// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
// We don't support sin/cos/sqrt/fmod/pow
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FREM , MVT::f64, Expand);
setOperationAction(ISD::FPOW , MVT::f64, Expand);
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM , MVT::f32, Expand);
setOperationAction(ISD::FPOW , MVT::f32, Expand);
if (Subtarget.hasSPE()) {
setOperationAction(ISD::FMA , MVT::f64, Expand);
setOperationAction(ISD::FMA , MVT::f32, Expand);
} else {
setOperationAction(ISD::FMA , MVT::f64, Legal);
setOperationAction(ISD::FMA , MVT::f32, Legal);
}
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
// If we're enabling GP optimizations, use hardware square root
if (!Subtarget.hasFSQRT() &&
!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
Subtarget.hasFRE()))
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
if (!Subtarget.hasFSQRT() &&
!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
Subtarget.hasFRES()))
setOperationAction(ISD::FSQRT, MVT::f32, Expand);
if (Subtarget.hasFCPSGN()) {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
} else {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
}
if (Subtarget.hasFPRND()) {
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
}
// PowerPC does not have BSWAP, but we can use the vector BSWAP instruction
// xxbrd to speed up scalar BSWAP64.
// CTPOP and CTTZ were introduced in P8 and P9, respectively.
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
if (Subtarget.hasP9Vector())
setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
else
setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
if (Subtarget.isISA3_0()) {
setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
} else {
setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
}
if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
} else {
setOperationAction(ISD::CTPOP, MVT::i32 , Expand);
setOperationAction(ISD::CTPOP, MVT::i64 , Expand);
}
// PowerPC does not have ROTR
setOperationAction(ISD::ROTR, MVT::i32 , Expand);
setOperationAction(ISD::ROTR, MVT::i64 , Expand);
if (!Subtarget.useCRBits()) {
// PowerPC does not have Select
setOperationAction(ISD::SELECT, MVT::i32, Expand);
setOperationAction(ISD::SELECT, MVT::i64, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f64, Expand);
}
// PowerPC wants to turn select_cc of FP into fsel when possible.
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
// PowerPC wants to optimize integer setcc a bit
if (!Subtarget.useCRBits())
setOperationAction(ISD::SETCC, MVT::i32, Custom);
// PowerPC does not have BRCOND which requires SetCC
if (!Subtarget.useCRBits())
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
if (Subtarget.hasSPE()) {
// SPE has built-in conversions
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
} else {
// PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
// PowerPC does not have [U|S]INT_TO_FP
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
}
if (Subtarget.hasDirectMove() && isPPC64) {
setOperationAction(ISD::BITCAST, MVT::f32, Legal);
setOperationAction(ISD::BITCAST, MVT::i32, Legal);
setOperationAction(ISD::BITCAST, MVT::i64, Legal);
setOperationAction(ISD::BITCAST, MVT::f64, Legal);
if (TM.Options.UnsafeFPMath) {
setOperationAction(ISD::LRINT, MVT::f64, Legal);
setOperationAction(ISD::LRINT, MVT::f32, Legal);
setOperationAction(ISD::LLRINT, MVT::f64, Legal);
setOperationAction(ISD::LLRINT, MVT::f32, Legal);
setOperationAction(ISD::LROUND, MVT::f64, Legal);
setOperationAction(ISD::LROUND, MVT::f32, Legal);
setOperationAction(ISD::LLROUND, MVT::f64, Legal);
setOperationAction(ISD::LLROUND, MVT::f32, Legal);
}
} else {
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
setOperationAction(ISD::BITCAST, MVT::i64, Expand);
setOperationAction(ISD::BITCAST, MVT::f64, Expand);
}
// We cannot sextinreg(i1). Expand to shifts.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
// NOTE: The EH_SJLJ_SETJMP/_LONGJMP support here is NOT intended for SjLj
// exception handling, but rather as a light-weight setjmp/longjmp replacement
// to support continuations, user-level threading, etc. As a result, no other
// SjLj exception interfaces are implemented; please don't build your own
// exception handling on top of these nodes.
// LLVM/Clang supports zero-cost DWARF exception handling.
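// (An illustrative assumption, not stated here: these nodes typically back
// the @llvm.eh.sjlj.setjmp / @llvm.eh.sjlj.longjmp intrinsics, e.g. as
// emitted for __builtin_setjmp/__builtin_longjmp.)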
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
// We want to legalize GlobalAddress and ConstantPool nodes into the
// appropriate instructions to materialize the address.
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
setOperationAction(ISD::JumpTable, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
// TRAP is legal.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
// TRAMPOLINE is custom lowered.
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
if (Subtarget.is64BitELFABI()) {
// VAARG always uses double-word chunks, so promote anything smaller.
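// (For example: an i8 va_arg is read as a full promoted i64 chunk and then
// truncated back to i8 -- an illustrative note on the promotion below.)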
setOperationAction(ISD::VAARG, MVT::i1, Promote);
AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);
setOperationAction(ISD::VAARG, MVT::i8, Promote);
AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);
setOperationAction(ISD::VAARG, MVT::i16, Promote);
AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);
setOperationAction(ISD::VAARG, MVT::i32, Promote);
AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
} else if (Subtarget.is32BitELFABI()) {
// VAARG is custom lowered with the 32-bit SVR4 ABI.
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::i64, Custom);
} else
setOperationAction(ISD::VAARG, MVT::Other, Expand);
// VACOPY is custom lowered with the 32-bit SVR4 ABI.
if (Subtarget.is32BitELFABI())
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
else
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
// Use the default implementation.
setOperationAction(ISD::VAEND , MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
// To handle counter-based loop conditions.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
// Comparisons that require checking two conditions.
if (Subtarget.hasSPE()) {
setCondCodeAction(ISD::SETO, MVT::f32, Expand);
setCondCodeAction(ISD::SETO, MVT::f64, Expand);
setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
}
setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
if (Subtarget.has64BitSupport()) {
// 64-bit-capable subtargets also have instructions for converting between
// i64 and fp.
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
// This is just the low 32 bits of a (signed) fp->i64 conversion.
// We cannot do this with Promote because i64 is not a legal type.
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
} else {
// PowerPC does not have FP_TO_UINT on 32-bit implementations.
if (Subtarget.hasSPE())
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
else
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
}
// With the instructions enabled under FPCVT, we can do everything.
if (Subtarget.hasFPCVT()) {
if (Subtarget.has64BitSupport()) {
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
}
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
}
if (Subtarget.use64BitRegs()) {
// 64-bit PowerPC implementations can support i64 types directly
addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
// BUILD_PAIR can't be handled natively, and should be expanded to shl/or
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
// 64-bit PowerPC wants to expand i128 shifts itself.
setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
} else {
// 32-bit PowerPC wants to expand i64 shifts itself.
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
}
if (Subtarget.hasVSX()) {
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
}
if (Subtarget.hasAltivec()) {
// First, set the operation action for all vector types to expand. Then we
// will selectively turn on the ones that can be effectively codegen'd.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
// For v2i64, these are only valid with P8Vector. This is corrected after
// the loop.
if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
}
else {
setOperationAction(ISD::SMAX, VT, Expand);
setOperationAction(ISD::SMIN, VT, Expand);
setOperationAction(ISD::UMAX, VT, Expand);
setOperationAction(ISD::UMIN, VT, Expand);
}
if (Subtarget.hasVSX()) {
setOperationAction(ISD::FMAXNUM, VT, Legal);
setOperationAction(ISD::FMINNUM, VT, Legal);
}
// Vector instructions introduced in P8
if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
setOperationAction(ISD::CTPOP, VT, Legal);
setOperationAction(ISD::CTLZ, VT, Legal);
}
else {
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
}
// Vector instructions introduced in P9
if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
setOperationAction(ISD::CTTZ, VT, Legal);
else
setOperationAction(ISD::CTTZ, VT, Expand);
// We promote all shuffles to v16i8.
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
// We promote all non-typed operations to v4i32.
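// (An illustrative note: e.g. an AND of two v16i8 values is bitcast to
// v4i32, executed as a v4i32 AND, and bitcast back -- these bitwise ops are
// type-agnostic, so one pattern covers all of the lane layouts.)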
setOperationAction(ISD::AND , VT, Promote);
AddPromotedToType (ISD::AND , VT, MVT::v4i32);
setOperationAction(ISD::OR , VT, Promote);
AddPromotedToType (ISD::OR , VT, MVT::v4i32);
setOperationAction(ISD::XOR , VT, Promote);
AddPromotedToType (ISD::XOR , VT, MVT::v4i32);
setOperationAction(ISD::LOAD , VT, Promote);
AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);
setOperationAction(ISD::SELECT, VT, Promote);
AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::SELECT_CC, VT, Promote);
AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
setOperationAction(ISD::STORE, VT, Promote);
AddPromotedToType (ISD::STORE, VT, MVT::v4i32);
// No other operations are legal.
setOperationAction(ISD::MUL , VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FNEG, VT, Expand);
setOperationAction(ISD::FSQRT, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
}
+ setOperationAction(ISD::SELECT_CC, MVT::v4i32, Expand);
if (!Subtarget.hasP8Vector()) {
setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
setOperationAction(ISD::UMAX, MVT::v2i64, Expand);
setOperationAction(ISD::UMIN, MVT::v2i64, Expand);
}
for (auto VT : {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8})
setOperationAction(ISD::ABS, VT, Custom);
// We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
// with merges, splats, etc.
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
// Vector truncates to sub-word integers that fit in an Altivec/VSX register
// are cheap, so handle them before they get expanded to scalar operations.
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
setOperationAction(ISD::AND , MVT::v4i32, Legal);
setOperationAction(ISD::OR , MVT::v4i32, Legal);
setOperationAction(ISD::XOR , MVT::v4i32, Legal);
setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
setOperationAction(ISD::SELECT, MVT::v4i32,
Subtarget.useCRBits() ? Legal : Expand);
setOperationAction(ISD::STORE , MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
// Without hasP8Altivec set, v2i64 SMAX isn't available.
// But ABS custom lowering requires SMAX support.
if (!Subtarget.hasP8Altivec())
setOperationAction(ISD::ABS, MVT::v2i64, Expand);
// With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
if (Subtarget.hasAltivec())
for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
setOperationAction(ISD::ROTL, VT, Legal);
// With hasP8Altivec set, we can lower ISD::ROTL to vrld.
if (Subtarget.hasP8Altivec())
setOperationAction(ISD::ROTL, MVT::v2i64, Legal);
addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
setOperationAction(ISD::MUL, MVT::v4f32, Legal);
setOperationAction(ISD::FMA, MVT::v4f32, Legal);
if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
}
if (Subtarget.hasP8Altivec())
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
else
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
// Altivec does not contain unordered floating-point compare instructions
setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
if (Subtarget.hasVSX()) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
if (Subtarget.hasP8Vector()) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
}
if (Subtarget.hasDirectMove() && isPPC64) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
// The nearbyint variants are not allowed to raise the inexact exception,
// so we can only code-gen them with unsafe math.
if (TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
}
setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::MUL, MVT::v2f64, Legal);
setOperationAction(ISD::FMA, MVT::v2f64, Legal);
setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
// Share the Altivec comparison restrictions.
setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
setOperationAction(ISD::STORE, MVT::v2f64, Legal);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);
if (Subtarget.hasP8Vector())
addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);
addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
if (Subtarget.hasP8Altivec()) {
setOperationAction(ISD::SHL, MVT::v2i64, Legal);
setOperationAction(ISD::SRA, MVT::v2i64, Legal);
setOperationAction(ISD::SRL, MVT::v2i64, Legal);
// 128-bit shifts can be accomplished via 3 instructions for SHL and
// SRL, but not for SRA because of the instructions available:
// VS{RL} and VS{RL}O. However, due to direct move costs, it's not worth
// doing.
setOperationAction(ISD::SHL, MVT::v1i128, Expand);
setOperationAction(ISD::SRL, MVT::v1i128, Expand);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
}
else {
setOperationAction(ISD::SHL, MVT::v2i64, Expand);
setOperationAction(ISD::SRA, MVT::v2i64, Expand);
setOperationAction(ISD::SRL, MVT::v2i64, Expand);
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
// VSX v2i64 only supports non-arithmetic operations.
setOperationAction(ISD::ADD, MVT::v2i64, Expand);
setOperationAction(ISD::SUB, MVT::v2i64, Expand);
}
setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
setOperationAction(ISD::STORE, MVT::v2i64, Promote);
AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
// Custom handling for partial vectors of integers converted to
// floating point. We already have optimal handling for v2i32 through
// the DAG combine, so those aren't necessary.
setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i8, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
setOperationAction(ISD::FABS, MVT::v4f32, Legal);
setOperationAction(ISD::FABS, MVT::v2f64, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal);
if (Subtarget.hasDirectMove())
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
}
if (Subtarget.hasP8Altivec()) {
addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
}
if (Subtarget.hasP9Vector()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
// 128-bit shifts can be accomplished via 3 instructions for SHL and
// SRL, but not for SRA because of the instructions available:
// VS{RL} and VS{RL}O.
setOperationAction(ISD::SHL, MVT::v1i128, Legal);
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
if (EnableQuadPrecision) {
addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
setOperationAction(ISD::FADD, MVT::f128, Legal);
setOperationAction(ISD::FSUB, MVT::f128, Legal);
setOperationAction(ISD::FDIV, MVT::f128, Legal);
setOperationAction(ISD::FMUL, MVT::f128, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
// No extending loads to f128 on PPC.
for (MVT FPT : MVT::fp_valuetypes())
setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
setOperationAction(ISD::FMA, MVT::f128, Legal);
setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
setOperationAction(ISD::FRINT, MVT::f128, Legal);
setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
setOperationAction(ISD::FCEIL, MVT::f128, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
setOperationAction(ISD::FROUND, MVT::f128, Legal);
setOperationAction(ISD::SELECT, MVT::f128, Expand);
setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i128, Custom);
// No implementation for these ops for PowerPC.
setOperationAction(ISD::FSIN , MVT::f128, Expand);
setOperationAction(ISD::FCOS , MVT::f128, Expand);
setOperationAction(ISD::FPOW, MVT::f128, Expand);
setOperationAction(ISD::FPOWI, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
}
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::BSWAP, MVT::v8i16, Legal);
setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
setOperationAction(ISD::BSWAP, MVT::v2i64, Legal);
setOperationAction(ISD::BSWAP, MVT::v1i128, Legal);
}
if (Subtarget.hasP9Altivec()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
}
}
if (Subtarget.hasQPX()) {
setOperationAction(ISD::FADD, MVT::v4f64, Legal);
setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::FREM, MVT::v4f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);
setOperationAction(ISD::LOAD , MVT::v4f64, Custom);
setOperationAction(ISD::STORE , MVT::v4f64, Custom);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);
if (!Subtarget.useCRBits())
setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand);
setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal);
setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);
setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
setOperationAction(ISD::FABS , MVT::v4f64, Legal);
setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand);
setOperationAction(ISD::FEXP , MVT::v4f64, Expand);
setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand);
setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);
addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::FREM, MVT::v4f32, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);
setOperationAction(ISD::LOAD , MVT::v4f32, Custom);
setOperationAction(ISD::STORE , MVT::v4f32, Custom);
if (!Subtarget.useCRBits())
setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand);
setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal);
setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand);
setOperationAction(ISD::FNEG , MVT::v4f32, Legal);
setOperationAction(ISD::FABS , MVT::v4f32, Legal);
setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand);
setOperationAction(ISD::FEXP , MVT::v4f32, Expand);
setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand);
setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);
addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);
setOperationAction(ISD::AND , MVT::v4i1, Legal);
setOperationAction(ISD::OR , MVT::v4i1, Legal);
setOperationAction(ISD::XOR , MVT::v4i1, Legal);
if (!Subtarget.useCRBits())
setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);
setOperationAction(ISD::LOAD , MVT::v4i1, Custom);
setOperationAction(ISD::STORE , MVT::v4i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand);
setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);
setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
setOperationAction(ISD::FROUND, MVT::v4f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
// These need to set FE_INEXACT, and so cannot be vectorized here.
setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
if (TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
} else {
setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);
setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
}
}
if (Subtarget.has64BitSupport())
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
if (!isPPC64) {
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
}
setBooleanContents(ZeroOrOneBooleanContent);
if (Subtarget.hasAltivec()) {
// Altivec instructions set fields to all zeros or all ones.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
}
if (!isPPC64) {
// These libcalls are not available in 32-bit mode.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
}
setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::BR_CC);
if (Subtarget.useCRBits())
setTargetDAGCombine(ISD::BRCOND);
setTargetDAGCombine(ISD::BSWAP);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
if (Subtarget.useCRBits()) {
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::SELECT_CC);
}
// Use reciprocal estimates.
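// (A sketch of the intent: under unsafe FP math, the FDIV/FSQRT combines
// can rewrite a/b as a * fre(b) and sqrt(x) as x * frsqrte(x), each refined
// with Newton-Raphson iterations instead of issuing the full-precision op.)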
if (TM.Options.UnsafeFPMath) {
setTargetDAGCombine(ISD::FDIV);
setTargetDAGCombine(ISD::FSQRT);
}
if (Subtarget.hasP9Altivec()) {
setTargetDAGCombine(ISD::ABS);
setTargetDAGCombine(ISD::VSELECT);
}
// Darwin long double math library functions have $LDBL128 appended.
if (Subtarget.isDarwin()) {
setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
}
if (EnableQuadPrecision) {
setLibcallName(RTLIB::LOG_F128, "logf128");
setLibcallName(RTLIB::LOG2_F128, "log2f128");
setLibcallName(RTLIB::LOG10_F128, "log10f128");
setLibcallName(RTLIB::EXP_F128, "expf128");
setLibcallName(RTLIB::EXP2_F128, "exp2f128");
setLibcallName(RTLIB::SIN_F128, "sinf128");
setLibcallName(RTLIB::COS_F128, "cosf128");
setLibcallName(RTLIB::POW_F128, "powf128");
setLibcallName(RTLIB::FMIN_F128, "fminf128");
setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
setLibcallName(RTLIB::POWI_F128, "__powikf2");
setLibcallName(RTLIB::REM_F128, "fmodf128");
}
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
setHasMultipleConditionRegisters();
setJumpIsExpensive();
}
setMinFunctionAlignment(Align(4));
if (Subtarget.isDarwin())
setPrefFunctionAlignment(Align(16));
switch (Subtarget.getCPUDirective()) {
default: break;
case PPC::DIR_970:
case PPC::DIR_A2:
case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
case PPC::DIR_PWR4:
case PPC::DIR_PWR5:
case PPC::DIR_PWR5X:
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
case PPC::DIR_PWR_FUTURE:
setPrefLoopAlignment(Align(16));
setPrefFunctionAlignment(Align(16));
break;
}
if (Subtarget.enableMachineScheduler())
setSchedulingPreference(Sched::Source);
else
setSchedulingPreference(Sched::Hybrid);
computeRegisterProperties(STI.getRegisterInfo());
// The Freescale cores do better with aggressive inlining of memcpy and
// friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
if (Subtarget.getCPUDirective() == PPC::DIR_E500mc ||
Subtarget.getCPUDirective() == PPC::DIR_E5500) {
MaxStoresPerMemset = 32;
MaxStoresPerMemsetOptSize = 16;
MaxStoresPerMemcpy = 32;
MaxStoresPerMemcpyOptSize = 8;
MaxStoresPerMemmove = 32;
MaxStoresPerMemmoveOptSize = 8;
} else if (Subtarget.getCPUDirective() == PPC::DIR_A2) {
// The A2 also benefits from (very) aggressive inlining of memcpy and
// friends. The overhead of the function call, even when warm, can be
// over one hundred cycles.
MaxStoresPerMemset = 128;
MaxStoresPerMemcpy = 128;
MaxStoresPerMemmove = 128;
MaxLoadsPerMemcmp = 128;
} else {
MaxLoadsPerMemcmp = 8;
MaxLoadsPerMemcmpOptSize = 4;
}
}
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
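/// For example (an illustrative sketch): for a struct containing a 128-bit
/// vector member, a call with MaxMaxAlign == 16 (Altivec) raises MaxAlign to
/// 16, while MaxMaxAlign == 32 (QPX) with a 256-bit vector raises it to 32.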
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
unsigned MaxMaxAlign) {
if (MaxAlign == MaxMaxAlign)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
MaxAlign = 32;
else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
MaxAlign = 16;
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
unsigned EltAlign = 0;
getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
unsigned EltAlign = 0;
getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
if (MaxAlign == MaxMaxAlign)
break;
}
}
}
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
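/// For example (an illustrative sketch): on PPC64 ELF with Altivec, a struct
/// containing a v4i32 member is aligned to 16 bytes, while a struct of plain
/// scalars gets the default of 8 bytes on PPC64 (4 bytes on PPC32).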
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
// Darwin passes everything on a 4-byte boundary.
if (Subtarget.isDarwin())
return 4;
// 16-byte and wider vectors are passed on a 16-byte boundary.
// The rest are passed on an 8-byte boundary on PPC64 and 4-byte on PPC32.
unsigned Align = Subtarget.isPPC64() ? 8 : 4;
if (Subtarget.hasAltivec() || Subtarget.hasQPX())
getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
return Align;
}
bool PPCTargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
bool PPCTargetLowering::hasSPE() const {
return Subtarget.hasSPE();
}
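// The two forms below are equivalent; for scalar integer types PPC prefers
// the increment-of-add form (an illustrative note, not from this file):
//   sub %y, (xor %x, -1)  <-->  add (add %x, 1), %y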
bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
return VT.isScalarInteger();
}
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
case PPCISD::FSEL: return "PPCISD::FSEL";
case PPCISD::XSMAXCDP: return "PPCISD::XSMAXCDP";
case PPCISD::XSMINCDP: return "PPCISD::XSMINCDP";
case PPCISD::FCFID: return "PPCISD::FCFID";
case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";
case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
case PPCISD::FP_TO_UINT_IN_VSR:
return "PPCISD::FP_TO_UINT_IN_VSR,";
case PPCISD::FP_TO_SINT_IN_VSR:
return "PPCISD::FP_TO_SINT_IN_VSR";
case PPCISD::FRE: return "PPCISD::FRE";
case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
case PPCISD::STFIWX: return "PPCISD::STFIWX";
case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
case PPCISD::CMPB: return "PPCISD::CMPB";
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
case PPCISD::SRL: return "PPCISD::SRL";
case PPCISD::SRA: return "PPCISD::SRA";
case PPCISD::SHL: return "PPCISD::SHL";
case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
case PPCISD::CALL: return "PPCISD::CALL";
case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
case PPCISD::MTCTR: return "PPCISD::MTCTR";
case PPCISD::BCTRL: return "PPCISD::BCTRL";
case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
case PPCISD::MFVSR: return "PPCISD::MFVSR";
case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
case PPCISD::ANDI_rec_1_EQ_BIT:
return "PPCISD::ANDI_rec_1_EQ_BIT";
case PPCISD::ANDI_rec_1_GT_BIT:
return "PPCISD::ANDI_rec_1_GT_BIT";
case PPCISD::VCMP: return "PPCISD::VCMP";
case PPCISD::VCMPo: return "PPCISD::VCMPo";
case PPCISD::LBRX: return "PPCISD::LBRX";
case PPCISD::STBRX: return "PPCISD::STBRX";
case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
case PPCISD::LFIWZX: return "PPCISD::LFIWZX";
case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
case PPCISD::STXSIX: return "PPCISD::STXSIX";
case PPCISD::VEXTS: return "PPCISD::VEXTS";
case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE";
case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE";
case PPCISD::ST_VSR_SCAL_INT:
return "PPCISD::ST_VSR_SCAL_INT";
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
case PPCISD::BDNZ: return "PPCISD::BDNZ";
case PPCISD::BDZ: return "PPCISD::BDZ";
case PPCISD::MFFS: return "PPCISD::MFFS";
case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
case PPCISD::CR6SET: return "PPCISD::CR6SET";
case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT";
case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
case PPCISD::SC: return "PPCISD::SC";
case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
case PPCISD::RFEBB: return "PPCISD::RFEBB";
case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
case PPCISD::VABSD: return "PPCISD::VABSD";
case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI";
case PPCISD::QBFLT: return "PPCISD::QBFLT";
case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64";
case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE";
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
}
return nullptr;
}
EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
EVT VT) const {
if (!VT.isVector())
return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
if (Subtarget.hasQPX())
return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
return VT.changeVectorElementTypeToInteger();
}
bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
return true;
}
//===----------------------------------------------------------------------===//
// Node matching predicates, for use by the tblgen matching code.
//===----------------------------------------------------------------------===//
/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
static bool isFloatingPointZero(SDValue Op) {
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
return CFP->getValueAPF().isZero();
else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
// Maybe this has already been legalized into the constant pool?
if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
return CFP->getValueAPF().isZero();
}
return false;
}
/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return
/// true if Op is undef or if it matches the specified value.
static bool isConstantOrUndef(int Op, int Val) {
return Op < 0 || Op == Val;
}
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
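/// For example (an illustrative sketch): the big-endian two-input form
/// (ShuffleKind 0) accepts exactly the mask
///   <1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31>,
/// i.e. the odd (low-order in BE) byte of every halfword of both inputs.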
bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
bool IsLE = DAG.getDataLayout().isLittleEndian();
if (ShuffleKind == 0) {
if (IsLE)
return false;
for (unsigned i = 0; i != 16; ++i)
if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
return false;
} else if (ShuffleKind == 2) {
if (!IsLE)
return false;
for (unsigned i = 0; i != 16; ++i)
if (!isConstantOrUndef(N->getMaskElt(i), i*2))
return false;
} else if (ShuffleKind == 1) {
unsigned j = IsLE ? 0 : 1;
for (unsigned i = 0; i != 8; ++i)
if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
return false;
}
return true;
}
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUWUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
bool IsLE = DAG.getDataLayout().isLittleEndian();
if (ShuffleKind == 0) {
if (IsLE)
return false;
for (unsigned i = 0; i != 16; i += 2)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
return false;
} else if (ShuffleKind == 2) {
if (!IsLE)
return false;
for (unsigned i = 0; i != 16; i += 2)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+1))
return false;
} else if (ShuffleKind == 1) {
unsigned j = IsLE ? 0 : 2;
for (unsigned i = 0; i != 8; i += 2)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1))
return false;
}
return true;
}
/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
/// current subtarget.
///
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
const PPCSubtarget& Subtarget =
static_cast<const PPCSubtarget&>(DAG.getSubtarget());
if (!Subtarget.hasP8Vector())
return false;
bool IsLE = DAG.getDataLayout().isLittleEndian();
if (ShuffleKind == 0) {
if (IsLE)
return false;
for (unsigned i = 0; i != 16; i += 4)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+5) ||
!isConstantOrUndef(N->getMaskElt(i+2), i*2+6) ||
!isConstantOrUndef(N->getMaskElt(i+3), i*2+7))
return false;
} else if (ShuffleKind == 2) {
if (!IsLE)
return false;
for (unsigned i = 0; i != 16; i += 4)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+1) ||
!isConstantOrUndef(N->getMaskElt(i+2), i*2+2) ||
!isConstantOrUndef(N->getMaskElt(i+3), i*2+3))
return false;
} else if (ShuffleKind == 1) {
unsigned j = IsLE ? 0 : 4;
for (unsigned i = 0; i != 8; i += 4)
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
!isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) ||
!isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) ||
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) ||
!isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) ||
!isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3))
return false;
}
return true;
}
/// isVMerge - Common function, used to match vmrg* shuffles.
///
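/// For example (an illustrative sketch): with UnitSize == 1, LHSStart == 8
/// and RHSStart == 24 -- the big-endian vmrglb case below -- this accepts
/// the mask <8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31>.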
static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned LHSStart, unsigned RHSStart) {
if (N->getValueType(0) != MVT::v16i8)
return false;
assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
"Unsupported merge size!");
for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units
for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit
if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
LHSStart+j+i*UnitSize) ||
!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
RHSStart+j+i*UnitSize))
return false;
}
return true;
}
/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned ShuffleKind, SelectionDAG &DAG) {
if (DAG.getDataLayout().isLittleEndian()) {
if (ShuffleKind == 1) // unary
return isVMerge(N, UnitSize, 0, 0);
else if (ShuffleKind == 2) // swapped
return isVMerge(N, UnitSize, 0, 16);
else
return false;
} else {
if (ShuffleKind == 1) // unary
return isVMerge(N, UnitSize, 8, 8);
else if (ShuffleKind == 0) // normal
return isVMerge(N, UnitSize, 8, 24);
else
return false;
}
}
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned ShuffleKind, SelectionDAG &DAG) {
if (DAG.getDataLayout().isLittleEndian()) {
if (ShuffleKind == 1) // unary
return isVMerge(N, UnitSize, 8, 8);
else if (ShuffleKind == 2) // swapped
return isVMerge(N, UnitSize, 8, 24);
else
return false;
} else {
if (ShuffleKind == 1) // unary
return isVMerge(N, UnitSize, 0, 0);
else if (ShuffleKind == 0) // normal
return isVMerge(N, UnitSize, 0, 16);
else
return false;
}
}
/**
* Common function used to match vmrgew and vmrgow shuffles
*
* The indexOffset determines whether to look for even or odd words in
* the shuffle mask. This is based on the endianness of the target
* machine.
* - Little Endian:
* - Use offset of 0 to check for odd elements
* - Use offset of 4 to check for even elements
* - Big Endian:
* - Use offset of 0 to check for even elements
* - Use offset of 4 to check for odd elements
* A detailed description of the vector element ordering for little endian and
* big endian can be found in "Targeting your applications - what little endian
* and big endian IBM XL C/C++ compiler differences mean to you" at
* http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
*
* The mask to the shuffle vector instruction specifies the indices of the
* elements from the two input vectors to place in the result. The elements are
* numbered in array-access order, starting with the first vector. These vectors
* are always of type v16i8, thus each vector will contain 16 elements of 8 bits
* each. More info on the shufflevector instruction can be found in the Language
* Reference at http://llvm.org/docs/LangRef.html#shufflevector-instruction
*
* The RHSStartValue indicates whether the same input vectors are used (unary)
* or two different input vectors are used, based on the following:
* - If the instruction uses the same vector for both inputs, the range of the
* indices will be 0 to 15. In this case, the RHSStart value passed should
* be 0.
* - If the instruction has two different vectors then the range of the
* indices will be 0 to 31. In this case, the RHSStart value passed should
* be 16 (indices 0-15 specify elements in the first vector while indices 16
* to 31 specify elements in the second vector).
*
* \param[in] N The shuffle vector SD Node to analyze
* \param[in] IndexOffset Specifies whether to look for even or odd elements
* \param[in] RHSStartValue Specifies the starting index for the righthand input
* vector to the shuffle_vector instruction
* \return true iff this shuffle vector represents an even or odd word merge
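 *
 * For illustration (a sketch derived from the checks below): on a big-endian
 * target, an even-word merge of two different inputs (IndexOffset == 0,
 * RHSStartValue == 16) expects the mask
 *   <0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27>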
*/
static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
unsigned RHSStartValue) {
if (N->getValueType(0) != MVT::v16i8)
return false;
for (unsigned i = 0; i < 2; ++i)
for (unsigned j = 0; j < 4; ++j)
if (!isConstantOrUndef(N->getMaskElt(i*4+j),
i*RHSStartValue+j+IndexOffset) ||
!isConstantOrUndef(N->getMaskElt(i*4+j+8),
i*RHSStartValue+j+IndexOffset+8))
return false;
return true;
}
/**
* Determine if the specified shuffle mask is suitable for the vmrgew or
* vmrgow instructions.
*
* \param[in] N The shuffle vector SD Node to analyze
* \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
* \param[in] ShuffleKind Identify the type of merge:
* - 0 = big-endian merge with two different inputs;
* - 1 = either-endian merge with two identical inputs;
* - 2 = little-endian merge with two different inputs (inputs are swapped for
* little-endian merges).
* \param[in] DAG The current SelectionDAG
 * \return true iff this shuffle mask is suitable for a vmrgew or vmrgow
 * instruction
*/
bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
unsigned ShuffleKind, SelectionDAG &DAG) {
if (DAG.getDataLayout().isLittleEndian()) {
unsigned indexOffset = CheckEven ? 4 : 0;
if (ShuffleKind == 1) // Unary
return isVMerge(N, indexOffset, 0);
else if (ShuffleKind == 2) // swapped
return isVMerge(N, indexOffset, 16);
else
return false;
  } else {
unsigned indexOffset = CheckEven ? 0 : 4;
if (ShuffleKind == 1) // Unary
return isVMerge(N, indexOffset, 0);
else if (ShuffleKind == 0) // Normal
return isVMerge(N, indexOffset, 16);
else
return false;
}
}
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
/// amount, otherwise return -1.
/// The ShuffleKind distinguishes between big-endian operations with two
/// different inputs (0), either-endian operations with two identical inputs
/// (1), and little-endian operations with two different inputs (2). For the
/// latter, the input operands are swapped (see PPCInstrAltivec.td).
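///
/// For illustration (a sketch, not part of the original comments): on a
/// big-endian target with ShuffleKind 0, the mask <3, 4, 5, ..., 18> (sixteen
/// consecutive indices starting at 3) is a vsldoi by 3 bytes, so this returns
/// 3; on a little-endian target the result is adjusted to 16 - ShiftAmt.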
int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
if (N->getValueType(0) != MVT::v16i8)
return -1;
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
// Find the first non-undef value in the shuffle mask.
unsigned i;
for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
/*search*/;
if (i == 16) return -1; // all undef.
// Otherwise, check to see if the rest of the elements are consecutively
// numbered from this value.
unsigned ShiftAmt = SVOp->getMaskElt(i);
if (ShiftAmt < i) return -1;
ShiftAmt -= i;
bool isLE = DAG.getDataLayout().isLittleEndian();
if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
// Check the rest of the elements to see if they are consecutive.
for (++i; i != 16; ++i)
if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
return -1;
} else if (ShuffleKind == 1) {
// Check the rest of the elements to see if they are consecutive.
for (++i; i != 16; ++i)
if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
return -1;
} else
return -1;
if (isLE)
ShiftAmt = 16 - ShiftAmt;
return ShiftAmt;
}
/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a splat of a single element that is suitable for input to
/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
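///
/// For illustration (a sketch derived from the checks below): with
/// EltSize == 4, the mask <4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7> is a splat of
/// word element 1 of the first input vector.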
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) &&
EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");
// The consecutive indices need to specify an element, not part of two
// different elements. So abandon ship early if this isn't the case.
if (N->getMaskElt(0) % EltSize != 0)
return false;
// This is a splat operation if each element of the permute is the same, and
// if the value doesn't reference the second vector.
unsigned ElementBase = N->getMaskElt(0);
// FIXME: Handle UNDEF elements too!
if (ElementBase >= 16)
return false;
// Check that the indices are consecutive, in the case of a multi-byte element
// splatted with a v16i8 mask.
for (unsigned i = 1; i != EltSize; ++i)
if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
return false;
for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
if (N->getMaskElt(i) < 0) continue;
for (unsigned j = 0; j != EltSize; ++j)
if (N->getMaskElt(i+j) != N->getMaskElt(j))
return false;
}
return true;
}
/// Check that the mask is shuffling N byte elements. Within each N byte
/// element of the mask, the indices could be either in increasing or
/// decreasing order as long as they are consecutive.
/// \param[in] N the shuffle vector SD Node to analyze
/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
/// Word/DoubleWord/QuadWord).
/// \param[in] StepLen the index delta between adjacent mask entries within
///   each N byte element: 1 if the mask is in increasing order, -1 if it is
///   in decreasing order.
/// \return true iff the mask is shuffling N byte elements.
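///
/// For illustration (a sketch derived from the checks below): with Width == 4
/// and StepLen == -1, the byte-reversed-word mask
/// <3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12> is accepted, since within each
/// word the byte indices are consecutive in decreasing order.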
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
int StepLen) {
assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
"Unexpected element width.");
  assert((StepLen == 1 || StepLen == -1) && "Unexpected step length.");
unsigned NumOfElem = 16 / Width;
unsigned MaskVal[16]; // Width is never greater than 16
for (unsigned i = 0; i < NumOfElem; ++i) {
MaskVal[0] = N->getMaskElt(i * Width);
if ((StepLen == 1) && (MaskVal[0] % Width)) {
return false;
} else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
return false;
}
for (unsigned int j = 1; j < Width; ++j) {
MaskVal[j] = N->getMaskElt(i * Width + j);
if (MaskVal[j] != MaskVal[j-1] + StepLen) {
return false;
}
}
}
return true;
}
bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
unsigned &InsertAtByte, bool &Swap, bool IsLE) {
if (!isNByteElemShuffleMask(N, 4, 1))
return false;
// Now we look at mask elements 0,4,8,12
unsigned M0 = N->getMaskElt(0) / 4;
unsigned M1 = N->getMaskElt(4) / 4;
unsigned M2 = N->getMaskElt(8) / 4;
unsigned M3 = N->getMaskElt(12) / 4;
unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
unsigned BigEndianShifts[] = { 3, 0, 1, 2 };
// Below, let H and L be arbitrary elements of the shuffle mask
// where H is in the range [4,7] and L is in the range [0,3].
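  // For illustration (a sketch derived from the cases below): on a big-endian
  // target the word mask <4, 1, 2, 3> takes the first case with M0 == 4,
  // producing ShiftElts == 3, InsertAtByte == 0, and Swap == false.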
// H, 1, 2, 3 or L, 5, 6, 7
if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
(M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
InsertAtByte = IsLE ? 12 : 0;
Swap = M0 < 4;
return true;
}
// 0, H, 2, 3 or 4, L, 6, 7
if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
(M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
InsertAtByte = IsLE ? 8 : 4;
Swap = M1 < 4;
return true;
}
// 0, 1, H, 3 or 4, 5, L, 7
if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
(M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
InsertAtByte = IsLE ? 4 : 8;
Swap = M2 < 4;
return true;
}
// 0, 1, 2, H or 4, 5, 6, L
if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
(M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
InsertAtByte = IsLE ? 0 : 12;
Swap = M3 < 4;
return true;
}
// If both vector operands for the shuffle are the same vector, the mask will
// contain only elements from the first one and the second one will be undef.
if (N->getOperand(1).isUndef()) {
ShiftElts = 0;
Swap = true;
unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
InsertAtByte = IsLE ? 12 : 0;
return true;
}
if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
InsertAtByte = IsLE ? 8 : 4;
return true;
}
if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
InsertAtByte = IsLE ? 4 : 8;
return true;
}
if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
InsertAtByte = IsLE ? 0 : 12;
return true;
}
}
return false;
}
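/// Check whether this shuffle can be lowered to an XXSLDWI. For illustration
/// (a sketch derived from the checks below): on a big-endian target the word
/// mask <1, 2, 3, 4> is accepted with Swap == false and ShiftElts == 1.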
bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
bool &Swap, bool IsLE) {
assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
// Ensure each byte index of the word is consecutive.
if (!isNByteElemShuffleMask(N, 4, 1))
return false;
// Now we look at mask elements 0,4,8,12, which are the beginning of words.
unsigned M0 = N->getMaskElt(0) / 4;
unsigned M1 = N->getMaskElt(4) / 4;
unsigned M2 = N->getMaskElt(8) / 4;
unsigned M3 = N->getMaskElt(12) / 4;
// If both vector operands for the shuffle are the same vector, the mask will
// contain only elements from the first one and the second one will be undef.
if (N->getOperand(1).isUndef()) {
assert(M0 < 4 && "Indexing into an undef vector?");
if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
return false;
ShiftElts = IsLE ? (4 - M0) % 4 : M0;
Swap = false;
return true;
}
// Ensure each word index of the ShuffleVector Mask is consecutive.
if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
return false;
if (IsLE) {
if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
// Input vectors don't need to be swapped if the leading element
// of the result is one of the 3 left elements of the second vector
// (or if there is no shift to be done at all).
Swap = false;
ShiftElts = (8 - M0) % 8;
} else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
// Input vectors need to be swapped if the leading element
// of the result is one of the 3 left elements of the first vector
// (or if we're shifting by 4 - thereby simply swapping the vectors).
Swap = true;
ShiftElts = (4 - M0) % 4;
}
return true;
} else { // BE
if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
// Input vectors don't need to be swapped if the leading element
// of the result is one of the 4 elements of the first vector.
Swap = false;
ShiftElts = M0;
} else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
// Input vectors need to be swapped if the leading element
// of the result is one of the 4 elements of the right vector.
Swap = true;
ShiftElts = M0 - 4;
}
return true;
}
}
bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
if (!isNByteElemShuffleMask(N, Width, -1))
return false;
for (int i = 0; i < 16; i += Width)
if (N->getMaskElt(i) != i + Width - 1)
return false;
return true;
}
bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 2);
}
bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 4);
}
bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 8);
}
bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 16);
}
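// For illustration (a sketch derived from isXXBRShuffleMaskHelper): the XXBRD
// (byte-reverse doubleword) form matches the mask
//   <7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8>
// where the first byte of each Width-byte element selects index
// i + Width - 1 and the remaining bytes count down from it.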
/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
/// if the inputs to the instruction should be swapped and set \p DM to the
/// value for the immediate.
/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
/// AND element 0 of the result comes from the first input (LE) or second input
/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
/// mask.
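///
/// For illustration (a sketch derived from the checks below): on a big-endian
/// target with two different inputs, M0 == 0 and M1 == 3 (doubleword 0 of the
/// first input followed by doubleword 1 of the second) yields Swap == false
/// and DM == (0 << 1) + (3 & 1) == 1.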
bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
bool &Swap, bool IsLE) {
assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
// Ensure each byte index of the double word is consecutive.
if (!isNByteElemShuffleMask(N, 8, 1))
return false;
unsigned M0 = N->getMaskElt(0) / 8;
unsigned M1 = N->getMaskElt(8) / 8;
assert(((M0 | M1) < 4) && "A mask element out of bounds?");
// If both vector operands for the shuffle are the same vector, the mask will
// contain only elements from the first one and the second one will be undef.
if (N->getOperand(1).isUndef()) {
if ((M0 | M1) < 2) {
DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
Swap = false;
return true;
} else
return false;
}
if (IsLE) {
if (M0 > 1 && M1 < 2) {
Swap = false;
} else if (M0 < 2 && M1 > 1) {
M0 = (M0 + 2) % 4;
M1 = (M1 + 2) % 4;
Swap = true;
} else
return false;
    // Note: if control flow reaches this point, Swap has already been set
    // above.
DM = (((~M1) & 1) << 1) + ((~M0) & 1);
return true;
} else { // BE
if (M0 < 2 && M1 > 1) {
Swap = false;
} else if (M0 > 1 && M1 < 2) {
M0 = (M0 + 2) % 4;
M1 = (M1 + 2) % 4;
Swap = true;
} else
return false;
    // Note: if control flow reaches this point, Swap has already been set
    // above.
DM = (M0 << 1) + (M1 & 1);
return true;
}
}
/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
/// appropriate for PPC mnemonics (which have a big endian bias - namely
/// elements are counted from the left of the vector register).
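///
/// For illustration (a sketch): with EltSize == 4 and a mask splatting byte 8
/// (word element 2), a little-endian target returns (16/4) - 1 - (8/4) == 1,
/// while a big-endian target returns 8/4 == 2.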
unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
assert(isSplatShuffleMask(SVOp, EltSize));
if (DAG.getDataLayout().isLittleEndian())
return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
else
return SVOp->getMaskElt(0) / EltSize;
}
/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
/// by using a vspltis[bhw] instruction of the specified element size, return
/// the constant being splatted. The ByteSize field indicates the number of
/// bytes of each element [1,2,4] -> [b,h,w].
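///
/// For illustration (a sketch restating the examples in the code below): a
/// v16i8 build_vector of sixteen i8 constants all equal to 5 matches with
/// ByteSize == 1 (vspltisb 5), and a repeating byte pair {0, 1} matches with
/// ByteSize == 2 (vspltish 1).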
SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
SDValue OpVal(nullptr, 0);
// If ByteSize of the splat is bigger than the element size of the
// build_vector, then we have a case where we are checking for a splat where
// multiple elements of the buildvector are folded together into a single
  // logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
unsigned EltSize = 16/N->getNumOperands();
if (EltSize < ByteSize) {
unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
SDValue UniquedVals[4];
assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
// See if all of the elements in the buildvector agree across.
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
if (N->getOperand(i).isUndef()) continue;
// If the element isn't a constant, bail fully out.
if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
if (!UniquedVals[i&(Multiple-1)].getNode())
UniquedVals[i&(Multiple-1)] = N->getOperand(i);
else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
return SDValue(); // no match.
}
// Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
// either constant or undef values that are identical for each chunk. See
// if these chunks can form into a larger vspltis*.
// Check to see if all of the leading entries are either 0 or -1. If
// neither, then this won't fit into the immediate field.
bool LeadingZero = true;
bool LeadingOnes = true;
for (unsigned i = 0; i != Multiple-1; ++i) {
if (!UniquedVals[i].getNode()) continue; // Must have been undefs.
LeadingZero &= isNullConstant(UniquedVals[i]);
LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
}
// Finally, check the least significant entry.
if (LeadingZero) {
if (!UniquedVals[Multiple-1].getNode())
return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef
int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
if (Val < 16) // 0,0,0,4 -> vspltisw(4)
return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
}
if (LeadingOnes) {
if (!UniquedVals[Multiple-1].getNode())
return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef
      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)
return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
}
return SDValue();
}
// Check to see if this buildvec has a single non-undef value in its elements.
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
if (N->getOperand(i).isUndef()) continue;
if (!OpVal.getNode())
OpVal = N->getOperand(i);
else if (OpVal != N->getOperand(i))
return SDValue();
}
if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.
unsigned ValSizeInBytes = EltSize;
uint64_t Value = 0;
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
Value = CN->getZExtValue();
} else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
Value = FloatToBits(CN->getValueAPF().convertToFloat());
}
// If the splat value is larger than the element value, then we can never do
  // this splat. The only case where the replicated bits could fit into our
  // immediate field would be zero, and we prefer to use vxor for it.
if (ValSizeInBytes < ByteSize) return SDValue();
// If the element value is larger than the splat value, check if it consists
// of a repeated bit pattern of size ByteSize.
if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))
return SDValue();
// Properly sign extend the value.
int MaskVal = SignExtend32(Value, ByteSize * 8);
// If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
if (MaskVal == 0) return SDValue();
// Finally, if this value fits in a 5 bit sext field, return it
if (SignExtend32<5>(MaskVal) == MaskVal)
return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);
return SDValue();
}
/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift
/// amount, otherwise return -1.
int PPC::isQVALIGNIShuffleMask(SDNode *N) {
EVT VT = N->getValueType(0);
if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1)
return -1;
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
// Find the first non-undef value in the shuffle mask.
unsigned i;
for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i)
/*search*/;
if (i == 4) return -1; // all undef.
// Otherwise, check to see if the rest of the elements are consecutively
// numbered from this value.
unsigned ShiftAmt = SVOp->getMaskElt(i);
if (ShiftAmt < i) return -1;
ShiftAmt -= i;
// Check the rest of the elements to see if they are consecutive.
for (++i; i != 4; ++i)
if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
return -1;
return ShiftAmt;
}
//===----------------------------------------------------------------------===//
// Addressing Mode Selection
//===----------------------------------------------------------------------===//
/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
/// or 64-bit immediate, and if the value can be accurately represented as a
/// sign extension from a 16-bit value. If so, this returns true and sets Imm
/// to the immediate.
bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
if (!isa<ConstantSDNode>(N))
return false;
Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue();
if (N->getValueType(0) == MVT::i32)
return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
else
return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
}
bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
return isIntS16Immediate(Op.getNode(), Imm);
}
/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
/// be represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
SDValue &Index,
SelectionDAG &DAG) const {
for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
UI != E; ++UI) {
if (MemSDNode *Memop = dyn_cast<MemSDNode>(*UI)) {
if (Memop->getMemoryVT() == MVT::f64) {
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
}
}
}
return false;
}
/// SelectAddressRegReg - Given the specified address, check to see if it
/// can be represented as an indexed [r+r] operation. Returns false if it
/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
/// non-zero and N can be represented by a base register plus a signed 16-bit
/// displacement, make a more precise judgement by checking (displacement % \p
/// EncodingAlignment).
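///
/// For illustration (a sketch derived from the checks below): with
/// EncodingAlignment == 4, an ADD of a base and the constant 8 is rejected
/// here because it is better encoded as [r+imm], while an ADD of a base and
/// the constant 2 falls through to the indexed [r+r] form.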
bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
SDValue &Index, SelectionDAG &DAG,
unsigned EncodingAlignment) const {
int16_t imm = 0;
if (N.getOpcode() == ISD::ADD) {
    // SPE f64 load/store instructions cannot handle a 16-bit offset; they
    // only support 8-bit offsets, so check for the EVX reg+reg form first.
if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
return true;
if (isIntS16Immediate(N.getOperand(1), imm) &&
(!EncodingAlignment || !(imm % EncodingAlignment)))
return false; // r+i
if (N.getOperand(1).getOpcode() == PPCISD::Lo)
return false; // r+i
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
} else if (N.getOpcode() == ISD::OR) {
if (isIntS16Immediate(N.getOperand(1), imm) &&
(!EncodingAlignment || !(imm % EncodingAlignment)))
      return false; // r+i; fold it as r+imm if we can.
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are provably
// disjoint.
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
if (LHSKnown.Zero.getBoolValue()) {
KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));
// If all of the bits are known zero on the LHS or RHS, the add won't
// carry.
if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
}
}
}
return false;
}
// If we happen to be doing an i64 load or store into a stack slot that has
// less than a 4-byte alignment, then the frame-index elimination may need to
// use an indexed load or store instruction (because the offset may not be a
// multiple of 4). The extra register needed to hold the offset comes from the
// register scavenger, and it is possible that the scavenger will need to use
// an emergency spill slot. As a result, we need to make sure that a spill slot
// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
// stack slot.
static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
// FIXME: This does not handle the LWA case.
if (VT != MVT::i64)
return;
// NOTE: We'll exclude negative FIs here, which come from argument
// lowering, because there are no known test cases triggering this problem
// using packed structures (or similar). We can remove this exclusion if
// we find such a test case. The reason why this is so test-case driven is
// because this entire 'fixup' is only to prevent crashes (from the
// register scavenger) on not-really-valid inputs. For example, if we have:
// %a = alloca i1
// %b = bitcast i1* %a to i64*
  //   store i64 0, i64* %b
// then the store should really be marked as 'align 1', but is not. If it
// were marked as 'align 1' then the indexed form would have been
// instruction-selected initially, and the problem this 'fixup' is preventing
// won't happen regardless.
if (FrameIdx < 0)
return;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FrameIdx);
if (Align >= 4)
return;
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setHasNonRISpills();
}
/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
/// displacements that are multiples of that value.
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
SDValue &Base,
SelectionDAG &DAG,
unsigned EncodingAlignment) const {
// FIXME dl should come from parent load or store, not from address
SDLoc dl(N);
// If this can be more profitably realized as r+r, fail.
if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
return false;
if (N.getOpcode() == ISD::ADD) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
(!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
} else {
Base = N.getOperand(0);
}
return true; // [r+i]
} else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
// Match LOAD (ADD (X, Lo(G))).
assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
&& "Cannot handle constant offsets yet!");
Disp = N.getOperand(1).getOperand(0); // The global address.
assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
Disp.getOpcode() == ISD::TargetConstantPool ||
Disp.getOpcode() == ISD::TargetJumpTable);
Base = N.getOperand(0);
return true; // [&g+r]
}
} else if (N.getOpcode() == ISD::OR) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
(!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are
// provably disjoint.
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
// If all of the bits are known zero on the LHS or RHS, the add won't
// carry.
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
} else {
Base = N.getOperand(0);
}
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
return true;
}
}
} else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
// Loading from a constant address.
// If this address fits entirely in a 16-bit sext immediate field, codegen
// this as "d, 0"
int16_t Imm;
if (isIntS16Immediate(CN, Imm) &&
(!EncodingAlignment || (Imm % EncodingAlignment) == 0)) {
Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
CN->getValueType(0));
return true;
}
// Handle 32-bit sext immediates with LIS + addr mode.
if ((CN->getValueType(0) == MVT::i32 ||
(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
(!EncodingAlignment || (CN->getZExtValue() % EncodingAlignment) == 0)) {
int Addr = (int)CN->getZExtValue();
// Otherwise, break this down into an LIS + disp.
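      // For illustration (a sketch of the arithmetic below): for
      // Addr = 0x12348000, (short)Addr is -32768, so the high part is
      // (0x12348000 + 0x8000) >> 16 = 0x1235; LIS materializes 0x12350000,
      // and the -32768 displacement brings it back to 0x12348000.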
Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);
Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl,
MVT::i32);
unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
return true;
}
}
Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
} else
Base = N;
return true; // [r+0]
}
/// SelectAddressRegRegOnly - Given the specified address, force it to be
/// represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
SDValue &Index,
SelectionDAG &DAG) const {
// Check to see if we can easily represent this as an [r+r] address. This
// will fail if it thinks that the address is more profitably represented as
// reg+imm, e.g. where imm = 0.
if (SelectAddressRegReg(N, Base, Index, DAG))
return true;
// If the address is the result of an add, we will utilize the fact that the
// address calculation includes an implicit add. However, we can reduce
// register pressure if we do not materialize a constant just for use as the
  // index register. We fold away the add unless it is an add of a value and
  // a 16-bit signed constant where both operands have a single use.
int16_t imm = 0;
if (N.getOpcode() == ISD::ADD &&
(!isIntS16Immediate(N.getOperand(1), imm) ||
!N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
}
// Otherwise, do it the hard way, using R0 as the base register.
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
N.getValueType());
Index = N;
return true;
}
/// Returns true if we should use a direct load into vector instruction
/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
  // If there are any uses other than scalar to vector, then we should
// keep it as a scalar load -> direct move pattern to prevent multiple
// loads.
LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
if (!LD)
return false;
EVT MemVT = LD->getMemoryVT();
if (!MemVT.isSimple())
return false;
switch(MemVT.getSimpleVT().SimpleTy) {
case MVT::i64:
break;
case MVT::i32:
if (!ST.hasP8Vector())
return false;
break;
case MVT::i16:
case MVT::i8:
if (!ST.hasP9Vector())
return false;
break;
default:
return false;
}
SDValue LoadedVal(N, 0);
if (!LoadedVal.hasOneUse())
return false;
for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
UI != UE; ++UI)
if (UI.getUse().get().getResNo() == 0 &&
UI->getOpcode() != ISD::SCALAR_TO_VECTOR)
return false;
return true;
}
/// getPreIndexedAddressParts - Returns true, and sets the base pointer,
/// offset pointer, and addressing mode by reference, if the node's address
/// can be legally represented as a pre-indexed load / store address.
bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
if (DisablePPCPreinc) return false;
bool isLoad = true;
SDValue Ptr;
EVT VT;
unsigned Alignment;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
Alignment = LD->getAlignment();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
Ptr = ST->getBasePtr();
VT = ST->getMemoryVT();
Alignment = ST->getAlignment();
isLoad = false;
} else
return false;
  // Do not generate pre-inc forms for specific loads that feed
  // scalar_to_vector instructions, because we can fold these into a more
  // efficient instruction instead (such as LXSD).
if (isLoad && usePartialVectorLoads(N, Subtarget)) {
return false;
}
// PowerPC doesn't have preinc load/store instructions for vectors (except
// for QPX, which does have preinc r+r forms).
if (VT.isVector()) {
if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) {
return false;
} else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) {
AM = ISD::PRE_INC;
return true;
}
}
if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
// Common code will reject creating a pre-inc form if the base pointer
// is a frame index, or if N is a store and the base pointer is either
// the same as or a predecessor of the value being stored. Check for
// those situations here, and try with swapped Base/Offset instead.
bool Swap = false;
if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
Swap = true;
else if (!isLoad) {
SDValue Val = cast<StoreSDNode>(N)->getValue();
if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
Swap = true;
}
if (Swap)
std::swap(Base, Offset);
AM = ISD::PRE_INC;
return true;
}
// LDU/STU can only handle immediates that are a multiple of 4.
if (VT != MVT::i64) {
if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0))
return false;
} else {
// LDU/STU need an address with at least 4-byte alignment.
if (Alignment < 4)
return false;
if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4))
return false;
}
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
// PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
// sext i32 to i64 when addr mode is r+i.
if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
LD->getExtensionType() == ISD::SEXTLOAD &&
isa<ConstantSDNode>(Offset))
return false;
}
AM = ISD::PRE_INC;
return true;
}
//===----------------------------------------------------------------------===//
// LowerOperation implementation
//===----------------------------------------------------------------------===//
/// Set HiOpFlags and LoOpFlags to the target MO flags for a label reference,
/// taking the PIC relocation model and non-lazy-pointer globals into account.
static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
unsigned &HiOpFlags, unsigned &LoOpFlags,
const GlobalValue *GV = nullptr) {
HiOpFlags = PPCII::MO_HA;
LoOpFlags = PPCII::MO_LO;
// Don't use the pic base if not in PIC relocation model.
if (IsPIC) {
HiOpFlags |= PPCII::MO_PIC_FLAG;
LoOpFlags |= PPCII::MO_PIC_FLAG;
}
// If this is a reference to a global value that requires a non-lazy-ptr, make
// sure that instruction lowering adds it.
if (GV && Subtarget.hasLazyResolverStub(GV)) {
HiOpFlags |= PPCII::MO_NLP_FLAG;
LoOpFlags |= PPCII::MO_NLP_FLAG;
if (GV->hasHiddenVisibility()) {
HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
}
}
}
static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
SelectionDAG &DAG) {
SDLoc DL(HiPart);
EVT PtrVT = HiPart.getValueType();
SDValue Zero = DAG.getConstant(0, DL, PtrVT);
SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);
// With PIC, the first instruction is actually "GR+hi(&G)".
if (isPIC)
Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);
// Generate non-pic code that has direct accesses to the constant pool.
// The address of the global is just (hi(&g)+lo(&g)).
return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
}
static void setUsesTOCBasePtr(MachineFunction &MF) {
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setUsesTOCBasePtr();
}
static void setUsesTOCBasePtr(SelectionDAG &DAG) {
setUsesTOCBasePtr(DAG.getMachineFunction());
}
SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
SDValue GA) const {
const bool Is64Bit = Subtarget.isPPC64();
EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT)
: Subtarget.isAIXABI()
? DAG.getRegister(PPC::R2, VT)
: DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
SDValue Ops[] = { GA, Reg };
return DAG.getMemIntrinsicNode(
PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0,
MachineMemOperand::MOLoad);
}
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
const Constant *C = CP->getConstVal();
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
return getTOCEntry(DAG, SDLoc(CP), GA);
}
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, SDLoc(CP), GA);
}
SDValue CPIHi =
DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
SDValue CPILo =
DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
}
// For 64-bit PowerPC, prefer the more compact relative encodings.
// This trades 32 bits per jump table entry for one or two instructions
// at the jump site.
unsigned PPCTargetLowering::getJumpTableEncoding() const {
if (isJumpTableRelative())
return MachineJumpTableInfo::EK_LabelDifference32;
return TargetLowering::getJumpTableEncoding();
}
bool PPCTargetLowering::isJumpTableRelative() const {
if (UseAbsoluteJumpTables)
return false;
if (Subtarget.isPPC64() || Subtarget.isAIXABI())
return true;
return TargetLowering::isJumpTableRelative();
}
SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
switch (getTargetMachine().getCodeModel()) {
case CodeModel::Small:
case CodeModel::Medium:
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
default:
return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()));
}
}
const MCExpr *
PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
unsigned JTI,
MCContext &Ctx) const {
if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
switch (getTargetMachine().getCodeModel()) {
case CodeModel::Small:
case CodeModel::Medium:
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
default:
return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
}
SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
return getTOCEntry(DAG, SDLoc(JT), GA);
}
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, SDLoc(GA), GA);
}
SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG);
}
SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
const BlockAddress *BA = BASDN->getBlockAddress();
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual BlockAddress is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
return getTOCEntry(DAG, SDLoc(BASDN), GA);
}
// 32-bit position-independent ELF stores the BlockAddress in the .got.
if (Subtarget.is32BitELFABI() && isPositionIndependent())
return getTOCEntry(
DAG, SDLoc(BASDN),
DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()));
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG);
}
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
// FIXME: TLS addresses currently use medium model code sequences,
// which is the most useful form. Eventually support for small and
// large models could be added if users need it, at the cost of
// additional complexity.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool is64bit = Subtarget.isPPC64();
const Module *M = DAG.getMachineFunction().getFunction().getParent();
PICLevel::Level picLevel = M->getPICLevel();
const TargetMachine &TM = getTargetMachine();
TLSModel::Model Model = TM.getTLSModel(GV);
if (Model == TLSModel::LocalExec) {
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
PPCII::MO_TPREL_HA);
SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
PPCII::MO_TPREL_LO);
SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64)
: DAG.getRegister(PPC::R2, MVT::i32);
SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
}
if (Model == TLSModel::InitialExec) {
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
PPCII::MO_TLS);
SDValue GOTPtr;
if (is64bit) {
setUsesTOCBasePtr(DAG);
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
PtrVT, GOTReg, TGA);
} else {
if (!TM.isPositionIndependent())
GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
else if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
else
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
}
SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
PtrVT, TGA, GOTPtr);
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
}
if (Model == TLSModel::GeneralDynamic) {
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue GOTPtr;
if (is64bit) {
setUsesTOCBasePtr(DAG);
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
GOTReg, TGA);
} else {
if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
else
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
}
return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,
GOTPtr, TGA, TGA);
}
if (Model == TLSModel::LocalDynamic) {
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue GOTPtr;
if (is64bit) {
setUsesTOCBasePtr(DAG);
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
GOTReg, TGA);
} else {
if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
else
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
}
SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl,
PtrVT, GOTPtr, TGA, TGA);
SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,
PtrVT, TLSAddr, TGA);
return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
}
llvm_unreachable("Unknown TLS model!");
}
SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
SDLoc DL(GSDN);
const GlobalValue *GV = GSDN->getGlobal();
// 64-bit SVR4 ABI & AIX ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
return getTOCEntry(DAG, DL, GA);
}
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
GSDN->getOffset(),
PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, DL, GA);
}
SDValue GAHi =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
SDValue GALo =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG);
// If the global reference is actually to a non-lazy-pointer, we have to do an
// extra load to get the address of the global.
if (MOHiFlag & PPCII::MO_NLP_FLAG)
Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
return Ptr;
}
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDLoc dl(Op);
if (Op.getValueType() == MVT::v2i64) {
// When the operands themselves are v2i64 values, we need to do something
// special because VSX has no underlying comparison operations for these.
if (Op.getOperand(0).getValueType() == MVT::v2i64) {
// Equality can be handled by casting to the legal type for Altivec
// comparisons, everything else needs to be expanded.
if (CC == ISD::SETEQ || CC == ISD::SETNE) {
return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
DAG.getSetCC(dl, MVT::v4i32,
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
CC));
}
return SDValue();
}
// We handle most of these in the usual way.
return Op;
}
// If we're comparing for equality to zero, expose the fact that this is
// implemented as a ctlz/srl pair on ppc, so that the dag combiner can
// fold the new nodes.
if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
return V;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
// Leave comparisons against 0 and -1 alone for now, since they're usually
// optimized. FIXME: revisit this when we can custom lower all setcc
// optimizations.
if (C->isAllOnesValue() || C->isNullValue())
return SDValue();
}
// If we have an integer seteq/setne, turn it into a compare against zero
// by xor'ing the rhs with the lhs, which is faster than setting a
// condition register, reading it back out, and masking the correct bit. The
// normal approach here uses sub to do this instead of xor. Using xor exposes
// the result to other bit-twiddling opportunities.
EVT LHSVT = Op.getOperand(0).getValueType();
if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
EVT VT = Op.getValueType();
SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
Op.getOperand(1));
return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);
}
return SDValue();
}
SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
EVT VT = Node->getValueType(0);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue InChain = Node->getOperand(0);
SDValue VAListPtr = Node->getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
SDLoc dl(Node);
assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
// gpr_index
SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
VAListPtr, MachinePointerInfo(SV), MVT::i8);
InChain = GprIndex.getValue(1);
if (VT == MVT::i64) {
// Check if GprIndex is even
SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
DAG.getConstant(1, dl, MVT::i32));
SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
DAG.getConstant(0, dl, MVT::i32), ISD::SETNE);
SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
DAG.getConstant(1, dl, MVT::i32));
// Align GprIndex to be even if it isn't
GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
GprIndex);
}
// fpr index is 1 byte after gpr
SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
DAG.getConstant(1, dl, MVT::i32));
// fpr
SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
FprPtr, MachinePointerInfo(SV), MVT::i8);
InChain = FprIndex.getValue(1);
SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
DAG.getConstant(8, dl, MVT::i32));
SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
DAG.getConstant(4, dl, MVT::i32));
// areas
SDValue OverflowArea =
DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo());
InChain = OverflowArea.getValue(1);
SDValue RegSaveArea =
DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo());
InChain = RegSaveArea.getValue(1);
  // select overflow_area if index >= 8
SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
DAG.getConstant(8, dl, MVT::i32), ISD::SETLT);
// adjustment constant gpr_index * 4/8
SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
VT.isInteger() ? GprIndex : FprIndex,
DAG.getConstant(VT.isInteger() ? 4 : 8, dl,
MVT::i32));
// OurReg = RegSaveArea + RegConstant
SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
RegConstant);
// Floating types are 32 bytes into RegSaveArea
if (VT.isFloatingPoint())
OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
DAG.getConstant(32, dl, MVT::i32));
// increase {f,g}pr_index by 1 (or 2 if VT is i64)
SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
VT.isInteger() ? GprIndex : FprIndex,
DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl,
MVT::i32));
InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
VT.isInteger() ? VAListPtr : FprPtr,
MachinePointerInfo(SV), MVT::i8);
// determine if we should load from reg_save_area or overflow_area
SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);
  // increase overflow_area by 4/8 if gpr/fpr index >= 8
SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
DAG.getConstant(VT.isInteger() ? 4 : 8,
dl, MVT::i32));
OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
OverflowAreaPlusN);
InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
MachinePointerInfo(), MVT::i32);
return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
// We have to copy the entire va_list struct:
  // 2*sizeof(char) + 2 bytes of padding + 2*sizeof(char*) = 12 bytes
return DAG.getMemcpy(Op.getOperand(0), Op,
Op.getOperand(1), Op.getOperand(2),
DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true,
false, MachinePointerInfo(), MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
if (Subtarget.isAIXABI())
report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX.");
return Op.getOperand(0);
}
SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
if (Subtarget.isAIXABI())
report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX.");
SDValue Chain = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool isPPC64 = (PtrVT == MVT::i64);
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Ty = IntPtrTy;
Entry.Node = Trmp; Args.push_back(Entry);
// TrampSize == (isPPC64 ? 48 : 40);
Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl,
isPPC64 ? MVT::i64 : MVT::i32);
Args.push_back(Entry);
Entry.Node = FPtr; Args.push_back(Entry);
Entry.Node = Nest; Args.push_back(Entry);
// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
CallingConv::C, Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
}
SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
EVT PtrVT = getPointerTy(MF.getDataLayout());
SDLoc dl(Op);
if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
// For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
// We suppose the given va_list is already allocated.
//
// typedef struct {
// char gpr; /* index into the array of 8 GPRs
// * stored in the register save area
// * gpr=0 corresponds to r3,
// * gpr=1 to r4, etc.
// */
// char fpr; /* index into the array of 8 FPRs
// * stored in the register save area
// * fpr=0 corresponds to f1,
// * fpr=1 to f2, etc.
// */
// char *overflow_arg_area;
// /* location on stack that holds
// * the next overflow argument
// */
// char *reg_save_area;
// /* where r3:r10 and f1:f8 (if saved)
// * are stored
// */
// } va_list[1];
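  // For illustration (a sketch of the offsets used below, assuming 32-bit
  // pointers): gpr is stored at byte 0, fpr at byte 1, overflow_arg_area at
  // byte 4 (after 2 bytes of padding), and reg_save_area at byte 8.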
SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
PtrVT);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
PtrVT);
uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);
uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);
uint64_t FPROffset = 1;
SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
// Store first byte : number of int regs
SDValue firstStore =
DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),
MachinePointerInfo(SV), MVT::i8);
uint64_t nextOffset = FPROffset;
SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
ConstFPROffset);
// Store second byte : number of float regs
SDValue secondStore =
DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
MachinePointerInfo(SV, nextOffset), MVT::i8);
nextOffset += StackOffset;
nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);
// Store second word : arguments given on stack
SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
MachinePointerInfo(SV, nextOffset));
nextOffset += FrameOffset;
nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);
// Store third word : arguments given in registers
return DAG.getStore(thirdStore, dl, FR, nextPtr,
MachinePointerInfo(SV, nextOffset));
}
/// FPR - The set of FP registers that should be allocated for arguments
/// on Darwin and AIX.
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
PPC::F11, PPC::F12, PPC::F13};
/// QFPR - The set of QPX registers that should be allocated for arguments.
static const MCPhysReg QFPR[] = {
PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13};
/// CalculateStackSlotSize - Calculates the size reserved for this argument on
/// the stack.
static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
unsigned PtrByteSize) {
unsigned ArgSize = ArgVT.getStoreSize();
if (Flags.isByVal())
ArgSize = Flags.getByValSize();
// Round up to multiples of the pointer size, except for array members,
// which are always packed.
if (!Flags.isInConsecutiveRegs())
ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
return ArgSize;
}
/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack.
static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
ISD::ArgFlagsTy Flags,
unsigned PtrByteSize) {
unsigned Align = PtrByteSize;
// Altivec parameters are padded to a 16 byte boundary.
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
Align = 16;
// QPX vector types stored in double-precision are padded to a 32 byte
// boundary.
else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
Align = 32;
// ByVal parameters are aligned as requested.
if (Flags.isByVal()) {
unsigned BVAlign = Flags.getByValAlign();
if (BVAlign > PtrByteSize) {
if (BVAlign % PtrByteSize != 0)
llvm_unreachable(
"ByVal alignment is not a multiple of the pointer size");
Align = BVAlign;
}
}
// Array members are always packed to their original alignment.
if (Flags.isInConsecutiveRegs()) {
// If the array member was split into multiple registers, the first
// needs to be aligned to the size of the full type. (Except for
// ppcf128, which is only aligned as its f64 components.)
if (Flags.isSplit() && OrigVT != MVT::ppcf128)
Align = OrigVT.getStoreSize();
else
Align = ArgVT.getStoreSize();
}
return Align;
}
/// CalculateStackSlotUsed - Return whether this argument will use its
/// stack slot (instead of being passed in registers). ArgOffset,
/// AvailableFPRs, and AvailableVRs must hold the current argument
/// position, and will be updated to account for this argument.
static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
ISD::ArgFlagsTy Flags,
unsigned PtrByteSize,
unsigned LinkageSize,
unsigned ParamAreaSize,
unsigned &ArgOffset,
unsigned &AvailableFPRs,
unsigned &AvailableVRs, bool HasQPX) {
bool UseMemory = false;
// Respect alignment of argument on the stack.
unsigned Align =
CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
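// e.g. with a 16-byte alignment, an ArgOffset of 40 is rounded up to 48.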
// If there's no space left in the argument save area, we must
// use memory (this check also catches zero-sized arguments).
if (ArgOffset >= LinkageSize + ParamAreaSize)
UseMemory = true;
// Allocate argument on the stack.
ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
if (Flags.isInConsecutiveRegsLast())
ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// If we overran the argument save area, we must use memory
// (this check catches arguments passed partially in memory)
if (ArgOffset > LinkageSize + ParamAreaSize)
UseMemory = true;
// However, if the argument is actually passed in an FPR or a VR,
// we don't use memory after all.
if (!Flags.isByVal()) {
if (ArgVT == MVT::f32 || ArgVT == MVT::f64 ||
// QPX registers overlap with the scalar FP registers.
(HasQPX && (ArgVT == MVT::v4f32 ||
ArgVT == MVT::v4f64 ||
ArgVT == MVT::v4i1)))
if (AvailableFPRs > 0) {
--AvailableFPRs;
return false;
}
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
if (AvailableVRs > 0) {
--AvailableVRs;
return false;
}
}
return UseMemory;
}
/// EnsureStackAlignment - Round stack frame size up from NumBytes to
/// ensure minimum alignment required for target.
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
unsigned NumBytes) {
unsigned TargetAlign = Lowering->getStackAlignment();
unsigned AlignMask = TargetAlign - 1;
NumBytes = (NumBytes + AlignMask) & ~AlignMask;
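// e.g. with a 16-byte target alignment, AlignMask == 15 and a 40-byte frame
// is rounded up to (40 + 15) & ~15 == 48 bytes.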
return NumBytes;
}
SDValue PPCTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
if (Subtarget.isAIXABI())
return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG,
InVals);
if (Subtarget.is64BitELFABI())
return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
InVals);
if (Subtarget.is32BitELFABI())
return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
InVals);
return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG,
InVals);
}
SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// 32-bit SVR4 ABI Stack Frame Layout:
// +-----------------------------------+
// +--> | Back chain |
// | +-----------------------------------+
// | | Floating-point register save area |
// | +-----------------------------------+
// | | General register save area |
// | +-----------------------------------+
// | | CR save word |
// | +-----------------------------------+
// | | VRSAVE save word |
// | +-----------------------------------+
// | | Alignment padding |
// | +-----------------------------------+
// | | Vector register save area |
// | +-----------------------------------+
// | | Local variable space |
// | +-----------------------------------+
// | | Parameter list area |
// | +-----------------------------------+
// | | LR save word |
// | +-----------------------------------+
// SP--> +--- | Back chain |
// +-----------------------------------+
//
// Specifications:
// System V Application Binary Interface PowerPC Processor Supplement
// AltiVec Technology Programming Interface Manual
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
unsigned PtrByteSize = 4;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, PtrByteSize);
if (useSoftFloat())
CCInfo.PreAnalyzeFormalArguments(Ins);
CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
CCInfo.clearWasPPCF128();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
// Arguments stored in registers.
if (VA.isRegLoc()) {
const TargetRegisterClass *RC;
EVT ValVT = VA.getValVT();
switch (ValVT.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("ValVT not supported by formal arguments Lowering");
case MVT::i1:
case MVT::i32:
RC = &PPC::GPRCRegClass;
break;
case MVT::f32:
if (Subtarget.hasP8Vector())
RC = &PPC::VSSRCRegClass;
else if (Subtarget.hasSPE())
RC = &PPC::GPRCRegClass;
else
RC = &PPC::F4RCRegClass;
break;
case MVT::f64:
if (Subtarget.hasVSX())
RC = &PPC::VSFRCRegClass;
else if (Subtarget.hasSPE())
// SPE passes doubles in GPR pairs.
RC = &PPC::GPRCRegClass;
else
RC = &PPC::F8RCRegClass;
break;
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
RC = &PPC::VRRCRegClass;
break;
case MVT::v4f32:
RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
break;
case MVT::v2f64:
case MVT::v2i64:
RC = &PPC::VRRCRegClass;
break;
case MVT::v4f64:
RC = &PPC::QFRCRegClass;
break;
case MVT::v4i1:
RC = &PPC::QBRCRegClass;
break;
}
SDValue ArgValue;
// Transform the arguments stored in physical registers into
// virtual ones.
if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
assert(i + 1 < e && "No second half of double precision argument");
unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC);
unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);
SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32);
SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32);
if (!Subtarget.isLittleEndian())
std::swap(ArgValueLo, ArgValueHi);
ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo,
ArgValueHi);
} else {
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
ValVT == MVT::i1 ? MVT::i32 : ValVT);
if (ValVT == MVT::i1)
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
}
InVals.push_back(ArgValue);
} else {
// Argument stored in memory.
assert(VA.isMemLoc());
// Get the extended size of the argument type on the stack
unsigned ArgSize = VA.getLocVT().getStoreSize();
// Get the actual size of the argument type
unsigned ObjSize = VA.getValVT().getStoreSize();
unsigned ArgOffset = VA.getLocMemOffset();
// Stack objects in PPC32 are right justified.
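// e.g. an i8 promoted to a 4-byte slot is loaded from slot offset + 3.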
ArgOffset += ArgSize - ObjSize;
int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, isImmutable);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(
DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
}
}
// Assign locations to all of the incoming aggregate by value arguments.
// Aggregates passed by value are stored in the local variable space of the
// caller's stack frame, right above the parameter list area.
SmallVector<CCValAssign, 16> ByValArgLocs;
CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
// Area that is at least reserved in the caller of this function.
unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
MinReservedArea = std::max(MinReservedArea, LinkageSize);
// Set the size that is at least reserved in the caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
MinReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
FuncInfo->setMinReservedArea(MinReservedArea);
SmallVector<SDValue, 8> MemOps;
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
static const MCPhysReg GPArgRegs[] = {
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
};
const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
static const MCPhysReg FPArgRegs[] = {
PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
PPC::F8
};
unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
if (useSoftFloat() || hasSPE())
NumFPArgRegs = 0;
FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));
// Make room for NumGPArgRegs and NumFPArgRegs.
int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
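// With the 8 4-byte GPRs and 8 8-byte FPRs above, this reserves
// 8 * 4 + 8 * 8 == 96 bytes (32 bytes when no FP argument registers are
// saved).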
FuncInfo->setVarArgsStackOffset(
MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
CCInfo.getNextStackOffset(), true));
FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// The fixed integer arguments of a variadic function are stored to the
// VarArgsFrameIndex on the stack so that they may be loaded by
// dereferencing the result of va_next.
for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
// Get an existing live-in vreg, or add a new one.
unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
if (!VReg)
VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by four for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
// FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
// is set.
// The double arguments are stored to the VarArgsFrameIndex
// on the stack.
for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
// Get an existing live-in vreg, or add a new one.
unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
if (!VReg)
VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by eight for the next argument to store
SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
}
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
EVT ObjectVT, SelectionDAG &DAG,
SDValue ArgVal,
const SDLoc &dl) const {
if (Flags.isSExt())
ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
DAG.getValueType(ObjectVT));
else if (Flags.isZExt())
ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
DAG.getValueType(ObjectVT));
return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
}
SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
//
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
assert(!(CallConv == CallingConv::Fast && isVarArg) &&
"fastcc not supported on varargs functions");
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
unsigned PtrByteSize = 8;
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned Num_GPR_Regs = array_lengthof(GPR);
const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
const unsigned Num_VR_Regs = array_lengthof(VR);
const unsigned Num_QFPR_Regs = Num_FPR_Regs;
// Do a first pass over the arguments to determine whether the ABI
// guarantees that our caller has allocated the parameter save area
// on its stack frame. In the ELFv1 ABI, this is always the case;
// in the ELFv2 ABI, it is true if this is a vararg function or if
// any parameter is located in a stack slot.
bool HasParameterArea = !isELFv2ABI || isVarArg;
unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
unsigned NumBytes = LinkageSize;
unsigned AvailableFPRs = Num_FPR_Regs;
unsigned AvailableVRs = Num_VR_Regs;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (Ins[i].Flags.isNest())
continue;
if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytes, AvailableFPRs, AvailableVRs,
Subtarget.hasQPX()))
HasParameterArea = true;
}
// Add DAG nodes to load the arguments or copy them out of registers. On
// entry to a function on PPC, the arguments start after the linkage area,
// although the first ones are often in registers.
unsigned ArgOffset = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
unsigned &QFPR_idx = FPR_idx;
SmallVector<SDValue, 8> MemOps;
Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
EVT OrigVT = Ins[ArgNo].ArgVT;
unsigned ObjSize = ObjectVT.getStoreSize();
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
if (Ins[ArgNo].isOrigArg()) {
std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[ArgNo].getOrigArgIndex();
}
// We re-align the argument offset for each argument, except when using the
// fast calling convention, when we need to make sure we do that only when
// we'll actually use a stack slot.
unsigned CurArgOffset, Align;
auto ComputeArgOffset = [&]() {
/* Respect alignment of argument on the stack. */
Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
CurArgOffset = ArgOffset;
};
if (CallConv != CallingConv::Fast) {
ComputeArgOffset();
/* Compute GPR index associated with argument offset. */
GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
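// e.g. 24 bytes past the linkage area means three doublewords (X3-X5) are
// already taken, so the next candidate register is GPR[3], i.e. X6.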
}
// FIXME the codegen can be much improved in some cases.
// We do not have to keep everything in memory.
if (Flags.isByVal()) {
assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
// ObjSize is the true size; ArgSize is ObjSize rounded up to a multiple of
// registers.
ObjSize = Flags.getByValSize();
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// Empty aggregate parameters do not take up registers. Examples:
// struct { } a;
// union { } b;
// int c[0];
// etc. However, we have to provide a place-holder in InVals, so
// pretend we have an 8-byte item at the current address for that
// purpose.
if (!ObjSize) {
int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(FIN);
continue;
}
// Create a stack object covering all stack doublewords occupied
// by the argument. If the argument is (fully or partially) on
// the stack, or if the argument is fully in registers but the
// caller has allocated the parameter save area anyway, we can refer
// directly to the caller's stack frame. Otherwise, create a
// local copy in our own frame.
int FI;
if (HasParameterArea ||
ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
else
FI = MFI.CreateStackObject(ArgSize, Align, false);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// Handle aggregates smaller than 8 bytes.
if (ObjSize < PtrByteSize) {
// The value of the object is its address, which differs from the
// address of the enclosing doubleword on big-endian systems.
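// e.g. a 3-byte aggregate occupies the last 3 bytes of its doubleword, so
// its address is the slot address plus (8 - 3) == 5.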
SDValue Arg = FIN;
if (!isLittleEndian) {
SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);
Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
}
InVals.push_back(Arg);
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store;
if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
EVT ObjType = (ObjSize == 1 ? MVT::i8 :
(ObjSize == 2 ? MVT::i16 : MVT::i32));
Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
MachinePointerInfo(&*FuncArg), ObjType);
} else {
// For sizes that don't fit a truncating store (3, 5, 6, 7),
// store the whole register as-is to the parameter save area
// slot.
Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(&*FuncArg));
}
MemOps.push_back(Store);
}
// Whether we copied from a register or not, advance the offset
// into the parameter save area by a full doubleword.
ArgOffset += PtrByteSize;
continue;
}
// The value of the object is its address, which is the address of
// its first stack doubleword.
InVals.push_back(FIN);
// Store whatever pieces of the object are in registers to memory.
for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
if (GPR_idx == Num_GPR_Regs)
break;
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Addr = FIN;
if (j) {
SDValue Off = DAG.getConstant(j, dl, PtrVT);
Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
}
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
MachinePointerInfo(&*FuncArg, j));
MemOps.push_back(Store);
++GPR_idx;
}
ArgOffset += ArgSize;
continue;
}
switch (ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (Flags.isNest()) {
// The 'nest' parameter, if any, is passed in R11.
unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
break;
}
// These can be scalar arguments or elements of an integer array type
// passed directly. Clang may use those instead of "byval" aggregate
// types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
} else {
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
needsLoad = true;
ArgSize = PtrByteSize;
}
if (CallConv != CallingConv::Fast || needsLoad)
ArgOffset += 8;
break;
case MVT::f32:
case MVT::f64:
// These can be scalar arguments or elements of a float array type
// passed directly. The latter are used to implement ELFv2 homogeneous
// float aggregates.
if (FPR_idx != Num_FPR_Regs) {
unsigned VReg;
if (ObjectVT == MVT::f32)
VReg = MF.addLiveIn(FPR[FPR_idx],
Subtarget.hasP8Vector()
? &PPC::VSSRCRegClass
: &PPC::F4RCRegClass);
else
VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
? &PPC::VSFRCRegClass
: &PPC::F8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++FPR_idx;
} else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
// FIXME: We may want to re-enable this for CallingConv::Fast on the P8
// once we support fp <-> gpr moves.
// This can only ever happen in the presence of f32 array types,
// since otherwise we never run out of FPRs before running out
// of GPRs.
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::f32) {
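// The float sits in either the high or the low 32 bits of the doubleword
// GPR, depending on its offset and the endianness; shift it into the low
// half before truncating to i32.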
if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
DAG.getConstant(32, dl, MVT::i32));
ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
}
ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
} else {
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
needsLoad = true;
}
// When passing an array of floats, the array occupies consecutive
// space in the argument area; only round up to the next doubleword
// at the end of the array. Otherwise, each float takes 8 bytes.
if (CallConv != CallingConv::Fast || needsLoad) {
ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
ArgOffset += ArgSize;
if (Flags.isInConsecutiveRegsLast())
ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
}
break;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
if (!Subtarget.hasQPX()) {
// These can be scalar arguments or elements of a vector array type
// passed directly. The latter are used to implement ELFv2 homogeneous
// vector aggregates.
if (VR_idx != Num_VR_Regs) {
unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++VR_idx;
} else {
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
needsLoad = true;
}
if (CallConv != CallingConv::Fast || needsLoad)
ArgOffset += 16;
break;
} // not QPX
assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
"Invalid QPX parameter type");
LLVM_FALLTHROUGH;
case MVT::v4f64:
case MVT::v4i1:
// QPX vectors are treated like their scalar floating-point subregisters
// (except that they're larger).
unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32;
if (QFPR_idx != Num_QFPR_Regs) {
const TargetRegisterClass *RC;
switch (ObjectVT.getSimpleVT().SimpleTy) {
case MVT::v4f64: RC = &PPC::QFRCRegClass; break;
case MVT::v4f32: RC = &PPC::QSRCRegClass; break;
default: RC = &PPC::QBRCRegClass; break;
}
unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++QFPR_idx;
} else {
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
needsLoad = true;
}
if (CallConv != CallingConv::Fast || needsLoad)
ArgOffset += Sz;
break;
}
// We need to load the argument to a virtual register if we determined
// above that we ran out of physical registers of the appropriate type.
if (needsLoad) {
if (ObjSize < ArgSize && !isLittleEndian)
CurArgOffset += ArgSize - ObjSize;
int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
}
InVals.push_back(ArgVal);
}
// Area that is at least reserved in the caller of this function.
unsigned MinReservedArea;
if (HasParameterArea)
MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
else
MinReservedArea = LinkageSize;
// Set the size that is at least reserved in the caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
MinReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
FuncInfo->setMinReservedArea(MinReservedArea);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
int Depth = ArgOffset;
FuncInfo->setVarArgsFrameIndex(
MFI.CreateFixedObject(PtrByteSize, Depth, true));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// If this function is vararg, store any remaining integer argument regs
// to their spots on the stack so that they may be loaded by dereferencing
// the result of va_next.
for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx < Num_GPR_Regs; ++GPR_idx) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by eight for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
}
SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
//
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
EVT PtrVT = getPointerTy(MF.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
unsigned PtrByteSize = isPPC64 ? 8 : 4;
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
unsigned ArgOffset = LinkageSize;
// Area that is at least reserved in caller of this function.
unsigned MinReservedArea = ArgOffset;
static const MCPhysReg GPR_32[] = { // 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
};
static const MCPhysReg GPR_64[] = { // 64-bit registers.
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
const unsigned Num_VR_Regs = array_lengthof(VR);
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
// In 32-bit non-varargs functions, the stack space for vectors is after the
// stack space for non-vectors. We do not use this space unless we have
// too many vectors to fit in registers, something that only occurs in
// constructed examples, but we have to walk the arglist to figure
// that out...for the pathological case, compute VecArgOffset as the
// start of the vector parameter area. Computing VecArgOffset is the
// entire point of the following loop.
unsigned VecArgOffset = ArgOffset;
if (!isVarArg && !isPPC64) {
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
++ArgNo) {
EVT ObjectVT = Ins[ArgNo].VT;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
if (Flags.isByVal()) {
// ObjSize is the true size; ArgSize is ObjSize rounded up to a multiple of
// regs.
unsigned ObjSize = Flags.getByValSize();
unsigned ArgSize =
((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
VecArgOffset += ArgSize;
continue;
}
switch(ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
case MVT::i1:
case MVT::i32:
case MVT::f32:
VecArgOffset += 4;
break;
case MVT::i64: // PPC64
case MVT::f64:
// FIXME: We are guaranteed to be !isPPC64 at this point.
// Does MVT::i64 apply?
VecArgOffset += 8;
break;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
// Nothing to do, we're only looking at non-vector args here.
break;
}
}
}
// We've found where the vector parameter area in memory is. Skip the
// first 12 parameters; these don't use that memory.
VecArgOffset = ((VecArgOffset+15)/16)*16;
VecArgOffset += 12*16;
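// That is, the vector save area begins 12 * 16 == 192 bytes past the
// 16-byte-aligned end of the non-vector arguments, since the first twelve
// vector arguments travel in V2-V13.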
// Add DAG nodes to load the arguments or copy them out of registers. On
// entry to a function on PPC, the arguments start after the linkage area,
// although the first ones are often in registers.
SmallVector<SDValue, 8> MemOps;
unsigned nAltivecParamsAtEnd = 0;
Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
unsigned ObjSize = ObjectVT.getSizeInBits()/8;
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
if (Ins[ArgNo].isOrigArg()) {
std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[ArgNo].getOrigArgIndex();
}
unsigned CurArgOffset = ArgOffset;
// Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
if (isVarArg || isPPC64) {
MinReservedArea = ((MinReservedArea+15)/16)*16;
MinReservedArea += CalculateStackSlotSize(ObjectVT,
Flags,
PtrByteSize);
} else nAltivecParamsAtEnd++;
} else
// Calculate min reserved area.
MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
Flags,
PtrByteSize);
// FIXME the codegen can be much improved in some cases.
// We do not have to keep everything in memory.
if (Flags.isByVal()) {
assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
// ObjSize is the true size; ArgSize is ObjSize rounded up to a multiple of
// registers.
ObjSize = Flags.getByValSize();
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// Objects of size 1 and 2 are right justified, everything else is
// left justified. This means the memory address is adjusted forwards.
if (ObjSize==1 || ObjSize==2) {
CurArgOffset = CurArgOffset + (4 - ObjSize);
}
// The value of the object is its address.
int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(FIN);
if (ObjSize==1 || ObjSize==2) {
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg;
if (isPPC64)
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
else
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
SDValue Store =
DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(&*FuncArg), ObjType);
MemOps.push_back(Store);
++GPR_idx;
}
ArgOffset += PtrByteSize;
continue;
}
for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
// Store whatever pieces of the object are in registers
// to memory. ArgOffset will be the address of the beginning
// of the object.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg;
if (isPPC64)
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
else
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(&*FuncArg, j));
MemOps.push_back(Store);
++GPR_idx;
ArgOffset += PtrByteSize;
} else {
ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
break;
}
}
continue;
}
switch (ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
case MVT::i1:
case MVT::i32:
if (!isPPC64) {
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
if (ObjectVT == MVT::i1)
ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);
++GPR_idx;
} else {
needsLoad = true;
ArgSize = PtrByteSize;
}
// All int arguments reserve stack space in the Darwin ABI.
ArgOffset += PtrByteSize;
break;
}
LLVM_FALLTHROUGH;
case MVT::i64: // PPC64
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
++GPR_idx;
} else {
needsLoad = true;
ArgSize = PtrByteSize;
}
// All int arguments reserve stack space in the Darwin ABI.
ArgOffset += 8;
break;
case MVT::f32:
case MVT::f64:
// Every 4 bytes of argument space consumes one of the GPRs available for
// argument passing.
if (GPR_idx != Num_GPR_Regs) {
++GPR_idx;
if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
++GPR_idx;
}
if (FPR_idx != Num_FPR_Regs) {
unsigned VReg;
if (ObjectVT == MVT::f32)
VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
else
VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++FPR_idx;
} else {
needsLoad = true;
}
// All FP arguments reserve stack space in the Darwin ABI.
ArgOffset += isPPC64 ? 8 : ObjSize;
break;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
// Note that vector arguments in registers don't reserve stack space,
// except in varargs functions.
if (VR_idx != Num_VR_Regs) {
unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
if (isVarArg) {
while ((ArgOffset % 16) != 0) {
ArgOffset += PtrByteSize;
if (GPR_idx != Num_GPR_Regs)
GPR_idx++;
}
ArgOffset += 16;
GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
}
++VR_idx;
} else {
if (!isVarArg && !isPPC64) {
// Vectors go after all the nonvectors.
CurArgOffset = VecArgOffset;
VecArgOffset += 16;
} else {
// Vectors are aligned.
ArgOffset = ((ArgOffset+15)/16)*16;
CurArgOffset = ArgOffset;
ArgOffset += 16;
}
needsLoad = true;
}
break;
}
// We need to load the argument to a virtual register if we determined above
// that we ran out of physical registers of the appropriate type.
if (needsLoad) {
int FI = MFI.CreateFixedObject(ObjSize,
CurArgOffset + (ArgSize - ObjSize),
isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
}
InVals.push_back(ArgVal);
}
// Allow for Altivec parameters at the end, if needed.
if (nAltivecParamsAtEnd) {
MinReservedArea = ((MinReservedArea+15)/16)*16;
MinReservedArea += 16*nAltivecParamsAtEnd;
}
// Area that is at least reserved in the caller of this function.
MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
// Set the size that is at least reserved in the caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
MinReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
FuncInfo->setMinReservedArea(MinReservedArea);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
int Depth = ArgOffset;
FuncInfo->setVarArgsFrameIndex(
MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
Depth, true));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// If this function is vararg, store any remaining integer argument regs
// to their spots on the stack so that they may be loaded by dereferencing
// the result of va_next.
for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
unsigned VReg;
if (isPPC64)
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
else
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by the pointer size for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
}
/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
/// adjusted to accommodate the arguments for the tailcall.
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
unsigned ParamSize) {
if (!isTailCall) return 0;
PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
unsigned CallerMinReservedArea = FI->getMinReservedArea();
int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
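// A negative SPDiff means the callee needs more argument space than the
// caller reserved, so the stack pointer must be moved for the tail call.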
// Remember only if the new adjustment is bigger.
if (SPDiff < FI->getTailCallSPDelta())
FI->setTailCallSPDelta(SPDiff);
return SPDiff;
}
static bool isFunctionGlobalAddress(SDValue Callee);
static bool
callsShareTOCBase(const Function *Caller, SDValue Callee,
const TargetMachine &TM) {
// Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
// don't have enough information to determine if the caller and callee share
// the same TOC base, so we have to pessimistically assume they don't for
// correctness.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G)
return false;
const GlobalValue *GV = G->getGlobal();
// The medium and large code models are expected to provide a sufficiently
// large TOC to provide all data addressing needs of a module with a
// single TOC. Since each module will be addressed with a single TOC, we
// only need to check that caller and callee don't cross dso boundaries.
if (CodeModel::Medium == TM.getCodeModel() ||
CodeModel::Large == TM.getCodeModel())
return TM.shouldAssumeDSOLocal(*Caller->getParent(), GV);
// Otherwise we need to ensure callee and caller are in the same section,
// since the linker may allocate multiple TOCs, and we don't know which
// sections will belong to the same TOC base.
if (!GV->isStrongDefinitionForLinker())
return false;
// Any explicitly-specified sections and section prefixes must also match.
// Also, if we're using -ffunction-sections, then each function is always in
// a different section (the same is true for COMDAT functions).
if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() ||
GV->getSection() != Caller->getSection())
return false;
if (const auto *F = dyn_cast<Function>(GV)) {
if (F->getSectionPrefix() != Caller->getSectionPrefix())
return false;
}
// If the callee might be interposed, then we can't assume the ultimate call
// target will be in the same section. Even in cases where we can assume that
// interposition won't happen, in any case where the linker might insert a
// stub to allow for interposition, we must generate code as though
// interposition might occur. To understand why this matters, consider a
// situation where: a -> b -> c where the arrows indicate calls. b and c are
// in the same section, but a is in a different module (i.e. has a different
// TOC base pointer). If the linker allows for interposition between b and c,
// then it will generate a stub for the call edge between b and c which will
// save the TOC pointer into the designated stack slot allocated by b. If we
// return true here, and therefore allow a tail call between b and c, that
// stack slot won't exist and the b -> c stub will end up saving b's TOC base
// pointer into the stack slot allocated by a (where the a -> b stub saved
// a's TOC base pointer). If we're not considering a tail call, but rather
// whether a nop is needed after the call instruction in b because the linker
// will insert a stub, the linker might complain about a missing nop if we
// omit it (although many linkers don't complain in this case).
if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
return false;
return true;
}
static bool
needStackSlotPassParameters(const PPCSubtarget &Subtarget,
const SmallVectorImpl<ISD::OutputArg> &Outs) {
assert(Subtarget.is64BitELFABI());
const unsigned PtrByteSize = 8;
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned NumGPRs = array_lengthof(GPR);
const unsigned NumFPRs = 13;
const unsigned NumVRs = array_lengthof(VR);
const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
unsigned NumBytes = LinkageSize;
unsigned AvailableFPRs = NumFPRs;
unsigned AvailableVRs = NumVRs;
for (const ISD::OutputArg& Param : Outs) {
if (Param.Flags.isNest()) continue;
if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytes, AvailableFPRs, AvailableVRs,
Subtarget.hasQPX()))
return true;
}
return false;
}
static bool
hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
if (CS.arg_size() != CallerFn->arg_size())
return false;
ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin();
ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end();
Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
const Value* CalleeArg = *CalleeArgIter;
const Value* CallerArg = &(*CallerArgIter);
if (CalleeArg == CallerArg)
continue;
// e.g. @caller([4 x i64] %a, [4 x i64] %b) {
// tail call @callee([4 x i64] undef, [4 x i64] %b)
// }
// 1st argument of callee is undef and has the same type as caller.
if (CalleeArg->getType() == CallerArg->getType() &&
isa<UndefValue>(CalleeArg))
continue;
return false;
}
return true;
}
// Returns true if TCO is possible between the caller's and callee's
// calling conventions.
static bool
areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
CallingConv::ID CalleeCC) {
// Tail calls are possible with fastcc and ccc.
auto isTailCallableCC = [] (CallingConv::ID CC){
return CC == CallingConv::C || CC == CallingConv::Fast;
};
if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
return false;
// We can safely tail call both fastcc and ccc callees from a c calling
// convention caller. If the caller is fastcc, we may have less stack space
// than a non-fastcc caller with the same signature so disable tail-calls in
// that case.
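// e.g. a ccc caller may tail call both ccc and fastcc callees, but a fastcc
// caller may only tail call fastcc callees.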
return CallerCC == CallingConv::C || CallerCC == CalleeCC;
}
bool
PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
SDValue Callee,
CallingConv::ID CalleeCC,
ImmutableCallSite CS,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const {
bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
if (DisableSCO && !TailCallOpt) return false;
// Variadic argument functions are not supported.
if (isVarArg) return false;
auto &Caller = DAG.getMachineFunction().getFunction();
// Check that the calling conventions are compatible for tco.
if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC))
return false;
// A caller with any byval parameter is not supported.
if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
return false;
// A callee with any byval parameter is not supported either.
// Note: This is a quick workaround, because in some cases, e.g.
// caller's stack size > callee's stack size, we are still able to apply
// sibling call optimization. For example, gcc is able to do SCO for caller1
// in the following example, but not for caller2.
// struct test {
// long int a;
// char ary[56];
// } gTest;
// __attribute__((noinline)) int callee(struct test v, struct test *b) {
// b->a = v.a;
// return 0;
// }
// void caller1(struct test a, struct test c, struct test *b) {
// callee(gTest, b); }
// void caller2(struct test *b) { callee(gTest, b); }
if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
return false;
// If callee and caller use different calling conventions, we cannot pass
// parameters on stack since offsets for the parameter area may be different.
if (Caller.getCallingConv() != CalleeCC &&
needStackSlotPassParameters(Subtarget, Outs))
return false;
// No TCO/SCO on indirect calls because the caller has to restore its TOC
if (!isFunctionGlobalAddress(Callee) &&
!isa<ExternalSymbolSDNode>(Callee))
return false;
// If the caller and callee potentially have different TOC bases then we
// cannot tail call since we need to restore the TOC pointer after the call.
// ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
if (!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
return false;
// TCO allows altering callee ABI, so we don't have to check further.
if (CalleeCC == CallingConv::Fast && TailCallOpt)
return true;
if (DisableSCO) return false;
// If the callee uses the same argument list as the caller, we can apply SCO
// in this case. If it does not, we need to check whether the callee needs
// stack space for passing arguments.
if (!hasSameArgumentList(&Caller, CS) &&
needStackSlotPassParameters(Subtarget, Outs)) {
return false;
}
return true;
}
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
CallingConv::ID CalleeCC,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const {
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
return false;
// Variable argument functions are not supported.
if (isVarArg)
return false;
MachineFunction &MF = DAG.getMachineFunction();
CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
// Functions containing by val parameters are not supported.
for (unsigned i = 0; i != Ins.size(); i++) {
ISD::ArgFlagsTy Flags = Ins[i].Flags;
if (Flags.isByVal()) return false;
}
// Non-PIC/GOT tail calls are supported.
if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
return true;
// At the moment we can only do local tail calls (in same module, hidden
// or protected) if we are generating PIC.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
return G->getGlobal()->hasHiddenVisibility()
|| G->getGlobal()->hasProtectedVisibility();
}
return false;
}
/// isBLACompatibleAddress - Return the immediate to use if the specified
/// 32-bit value is representable in the immediate field of a BxA instruction.
static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C) return nullptr;
int Addr = C->getZExtValue();
if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
SignExtend32<26>(Addr) != Addr)
return nullptr; // Top 6 bits have to be sext of immediate.
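// e.g. 0x01FFFFFC is accepted (word-aligned and sign-extends from 26 bits),
// while 0x02000000 is rejected.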
return DAG
.getConstant(
(int)C->getZExtValue() >> 2, SDLoc(Op),
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))
.getNode();
}
namespace {
struct TailCallArgumentInfo {
SDValue Arg;
SDValue FrameIdxOp;
int FrameIdx = 0;
TailCallArgumentInfo() = default;
};
} // end anonymous namespace
/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
static void StoreTailCallArgumentsToStackSlot(
SelectionDAG &DAG, SDValue Chain,
const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
SDValue Arg = TailCallArgs[i].Arg;
SDValue FIN = TailCallArgs[i].FrameIdxOp;
int FI = TailCallArgs[i].FrameIdx;
// Store relative to framepointer.
MemOpChains.push_back(DAG.getStore(
Chain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
/// the appropriate stack slot for the tail call optimized function call.
static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
SDValue OldRetAddr, SDValue OldFP,
int SPDiff, const SDLoc &dl) {
if (SPDiff) {
// Calculate the new stack slot for the return address.
MachineFunction &MF = DAG.getMachineFunction();
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
const PPCFrameLowering *FL = Subtarget.getFrameLowering();
bool isPPC64 = Subtarget.isPPC64();
int SlotSize = isPPC64 ? 8 : 4;
int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize,
NewRetAddrLoc, true);
EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(MF, NewRetAddr));
// When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
// slot as the FP is never overwritten.
if (Subtarget.isDarwinABI()) {
int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset();
int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc,
true);
SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), NewFPIdx));
}
}
return Chain;
}
/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
/// the position of the argument.
static void
CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
SDValue Arg, int SPDiff, unsigned ArgOffset,
SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
int Offset = ArgOffset + SPDiff;
uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDValue FIN = DAG.getFrameIndex(FI, VT);
TailCallArgumentInfo Info;
Info.Arg = Arg;
Info.FrameIdxOp = FIN;
Info.FrameIdx = FI;
TailCallArguments.push_back(Info);
}
/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address
/// stack slot. Returns the chain as result and the loaded frame pointers in
/// LROpOut/FPOpout. Used when tail calling.
SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
SDValue &FPOpOut, const SDLoc &dl) const {
if (SPDiff) {
// Load the LR and FP stack slot for later adjusting.
EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
LROpOut = getReturnAddrFrameIndex(DAG);
LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
Chain = SDValue(LROpOut.getNode(), 1);
// When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
// slot as the FP is never overwritten.
if (Subtarget.isDarwinABI()) {
FPOpOut = getFramePointerFrameIndex(DAG);
FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo());
Chain = SDValue(FPOpOut.getNode(), 1);
}
}
return Chain;
}
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" of size "Size". Alignment information is
/// specified by the specific parameter attribute. The copy will be passed as
/// a byval function parameter.
/// Sometimes what we are copying is the end of a larger object, the part that
/// does not fit in registers.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
false, false, false, MachinePointerInfo(),
MachinePointerInfo());
}
/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
/// tail calls.
static void LowerMemOpCallTo(
SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
if (!isTailCall) {
if (isVector) {
SDValue StackPtr;
if (isPPC64)
StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
else
StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
DAG.getConstant(ArgOffset, dl, PtrVT));
}
MemOpChains.push_back(
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
// Calculate and remember argument location.
} else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
TailCallArguments);
}
static void
PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
SDValue FPOp,
SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
// Emit a sequence of copyto/copyfrom virtual registers for arguments that
// might overwrite each other in case of tail call optimization.
SmallVector<SDValue, 8> MemOpChains2;
// Do not flag preceding copytoreg stuff together with the following stuff.
InFlag = SDValue();
StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
MemOpChains2, dl);
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);
// Emit callseq_end just before tailcall node.
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
// Is this global address that of a function that can be called by name (as
// opposed to something that must hold a descriptor for an indirect call)?
static bool isFunctionGlobalAddress(SDValue Callee) {
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
if (Callee.getOpcode() == ISD::GlobalTLSAddress ||
Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
return false;
return G->getGlobal()->getValueType()->isFunctionTy();
}
return false;
}
SDValue PPCTargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCRetInfo.AnalyzeCallResult(
Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
? RetCC_PPC_Cold
: RetCC_PPC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Val;
if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
InFlag);
Chain = Lo.getValue(1);
InFlag = Lo.getValue(2);
VA = RVLocs[++i]; // skip ahead to next loc
SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
InFlag);
Chain = Hi.getValue(1);
InFlag = Hi.getValue(2);
if (!Subtarget.isLittleEndian())
std::swap(Lo, Hi);
Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi);
} else {
Val = DAG.getCopyFromReg(Chain, dl,
VA.getLocReg(), VA.getLocVT(), InFlag);
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
}
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::AExt:
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
break;
case CCValAssign::ZExt:
Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
DAG.getValueType(VA.getValVT()));
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
break;
case CCValAssign::SExt:
Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
DAG.getValueType(VA.getValVT()));
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
break;
}
InVals.push_back(Val);
}
return Chain;
}
static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
const PPCSubtarget &Subtarget, bool isPatchPoint) {
// PatchPoint calls are not indirect.
if (isPatchPoint)
return false;
if (isFunctionGlobalAddress(Callee) || dyn_cast<ExternalSymbolSDNode>(Callee))
return false;
// Darwin and 32-bit ELF can use a BLA. The descriptor-based ABIs cannot,
// because the immediate function pointer points to a descriptor instead of
// a function entry point. The ELFv2 ABI cannot use a BLA because the function
// pointer immediate points to the global entry point, while the BLA would
// need to jump to the local entry point (see rL211174).
if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
isBLACompatibleAddress(Callee, DAG))
return false;
return true;
}
static unsigned getCallOpcode(bool isIndirectCall, bool isPatchPoint,
bool isTailCall, const Function &Caller,
const SDValue &Callee,
const PPCSubtarget &Subtarget,
const TargetMachine &TM) {
if (isTailCall)
return PPCISD::TC_RETURN;
// This is a call through a function pointer.
if (isIndirectCall) {
// AIX and the 64-bit ELF ABIs need to maintain the TOC pointer across
// indirect calls. The save of the caller's TOC pointer to the stack will be
// inserted into the DAG as part of call lowering. The restore of the TOC
// pointer is modeled by using a pseudo instruction for the call opcode that
// represents the two-instruction sequence of an indirect branch and link,
// immediately followed by a load of the TOC pointer from the stack save
// slot into gpr2.
if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
return PPCISD::BCTRL_LOAD_TOC;
// An indirect call that does not need a TOC restore.
return PPCISD::BCTRL;
}
// The ABIs that maintain a TOC pointer across calls need to have a nop
// immediately following the call instruction if the caller and callee may
// have different TOC bases. At link time if the linker determines the calls
// may not share a TOC base, the call is redirected to a trampoline inserted
// by the linker. The trampoline will (among other things) save the caller's
// TOC pointer at an ABI designated offset in the linkage area and the linker
// will rewrite the nop to be a load of the TOC pointer from the linkage area
// into gpr2.
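// For example, a call that may cross TOC bases is emitted as:
//   bl callee
//   nop
// and if the linker routes the call through a trampoline, the nop is
// rewritten into a reload of the TOC pointer, e.g. an ld of gpr2 from the
// TOC save slot in the linkage area.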
if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
return callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
: PPCISD::CALL_NOP;
return PPCISD::CALL;
}
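// Names the compiler itself may emit calls to on AIX (compiler-rt helpers
// and common libc/libm routines), which are therefore known to be safe to
// reference as entry-point symbols until ExternalSymbolSDNode support is
// complete (see the TODO in transformCallee()).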
static bool isValidAIXExternalSymSDNode(StringRef SymName) {
return StringSwitch<bool>(SymName)
.Cases("__divdi3", "__fixunsdfdi", "__floatundidf", "__floatundisf",
"__moddi3", "__udivdi3", "__umoddi3", true)
.Cases("ceil", "floor", "memcpy", "memmove", "memset", "round", true)
.Default(false);
}
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
const SDLoc &dl, const PPCSubtarget &Subtarget) {
if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
return SDValue(Dest, 0);
// Returns true if the callee is local, and false otherwise.
auto isLocalCallee = [&]() {
const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
const GlobalValue *GV = G ? G->getGlobal() : nullptr;
return DAG.getTarget().shouldAssumeDSOLocal(*Mod, GV) &&
!dyn_cast_or_null<GlobalIFunc>(GV);
};
// The PLT is only used in 32-bit ELF PIC mode. Attempting to use the PLT in
// a static relocation model causes some versions of GNU LD (2.17.50, at
// least) to force BSS-PLT, instead of secure-PLT, even if all objects are
// built with secure-PLT.
bool UsePlt =
Subtarget.is32BitELFABI() && !isLocalCallee() &&
Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;
// On AIX, direct function calls reference the symbol for the function's
// entry point, which is named by prepending a "." before the function's
// C-linkage name.
const auto getAIXFuncEntryPointSymbolSDNode =
[&](StringRef FuncName, bool IsDeclaration,
const XCOFF::StorageClass &SC) {
auto &Context = DAG.getMachineFunction().getMMI().getContext();
MCSymbolXCOFF *S = cast<MCSymbolXCOFF>(
Context.getOrCreateSymbol(Twine(".") + Twine(FuncName)));
if (IsDeclaration && !S->hasContainingCsect()) {
// On AIX, an undefined symbol needs to be associated with a
// MCSectionXCOFF to get the correct storage mapping class.
// In this case, XCOFF::XMC_PR.
MCSectionXCOFF *Sec = Context.getXCOFFSection(
S->getName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC,
SectionKind::getMetadata());
S->setContainingCsect(Sec);
}
MVT PtrVT =
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
return DAG.getMCSymbol(S, PtrVT);
};
if (isFunctionGlobalAddress(Callee)) {
const GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
const GlobalValue *GV = G->getGlobal();
if (!Subtarget.isAIXABI())
return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0,
UsePlt ? PPCII::MO_PLT : 0);
assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX.");
const GlobalObject *GO = cast<GlobalObject>(GV);
const XCOFF::StorageClass SC =
TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO);
return getAIXFuncEntryPointSymbolSDNode(GO->getName(), GO->isDeclaration(),
SC);
}
if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *SymName = S->getSymbol();
if (!Subtarget.isAIXABI())
return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(),
UsePlt ? PPCII::MO_PLT : 0);
// If there exists a user-declared function whose name is the same as the
// ExternalSymbol's, then we pick up the user-declared version.
const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
if (const Function *F =
dyn_cast_or_null<Function>(Mod->getNamedValue(SymName))) {
const XCOFF::StorageClass SC =
TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(F);
return getAIXFuncEntryPointSymbolSDNode(F->getName(), F->isDeclaration(),
SC);
}
// TODO: Remove this when the support for ExternalSymbolSDNode is complete.
if (isValidAIXExternalSymSDNode(SymName)) {
return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT);
}
report_fatal_error("Unexpected ExternalSymbolSDNode: " + Twine(SymName));
}
// No transformation needed.
assert(Callee.getNode() && "What no callee?");
return Callee;
}
static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) {
assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START &&
"Expected a CALLSEQ_STARTSDNode.");
// The last value produced is the chain, except when the node has glue. If
// the node has glue, then the last value is the glue and the chain is the
// second-to-last value.
SDValue LastValue = CallSeqStart.getValue(CallSeqStart->getNumValues() - 1);
if (LastValue.getValueType() != MVT::Glue)
return LastValue;
return CallSeqStart.getValue(CallSeqStart->getNumValues() - 2);
}
// Creates the node that moves a function's address into the count register
// to prepare for an indirect call instruction.
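// The resulting sequence is an 'mtctr' of the target address; the call
// itself is then emitted as a branch through the count register (bctr or
// bctrl, depending on the opcode chosen in getCallOpcode()).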
static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
SDValue &Glue, SDValue &Chain,
const SDLoc &dl) {
SDValue MTCTROps[] = {Chain, Callee, Glue};
EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
Chain = DAG.getNode(PPCISD::MTCTR, dl, makeArrayRef(ReturnTypes, 2),
makeArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
// The glue is the second value produced.
Glue = Chain.getValue(1);
}
static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
SDValue &Glue, SDValue &Chain,
SDValue CallSeqStart,
ImmutableCallSite CS, const SDLoc &dl,
bool hasNest,
const PPCSubtarget &Subtarget) {
// Function pointers in the 64-bit SVR4 ABI do not point to the function
// entry point, but to the function descriptor (the function entry point
// address is part of the function descriptor though).
// The function descriptor is a three doubleword structure with the
// following fields: function entry point, TOC base address and
// environment pointer.
// Thus for a call through a function pointer, the following actions need
// to be performed:
// 1. Save the TOC of the caller in the TOC save area of its stack
// frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
// 2. Load the address of the function entry point from the function
// descriptor.
// 3. Load the TOC of the callee from the function descriptor into r2.
// 4. Load the environment pointer from the function descriptor into
// r11.
// 5. Branch to the function entry point address.
// 6. On return of the callee, the TOC of the caller needs to be
// restored (this is done in FinishCall()).
//
// The loads are scheduled at the beginning of the call sequence, and the
// register copies are flagged together to ensure that no other
// operations can be scheduled in between. E.g. without flagging the
// copies together, a TOC access in the caller could be scheduled between
// the assignment of the callee TOC and the branch to the callee, which leads
// to incorrect code.
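// Illustrative descriptor layout on 64-bit SVR4 (three doublewords):
//   +0:  function entry point address
//   +8:  TOC base address (the TOC anchor)
//   +16: environment pointer
// The actual offsets used below are taken from the subtarget.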
// Start by loading the function address from the descriptor.
SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart);
auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
? (MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant)
: MachineMemOperand::MONone;
MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr);
// Registers used in building the DAG.
const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
const MCRegister TOCReg = Subtarget.getTOCPointerRegister();
// Offsets of descriptor members.
const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset();
const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();
const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
const unsigned Alignment = Subtarget.isPPC64() ? 8 : 4;
// One load for the function's entry point address.
SDValue LoadFuncPtr = DAG.getLoad(RegVT, dl, LDChain, Callee, MPI,
Alignment, MMOFlags);
// One for loading the TOC anchor for the module that contains the called
// function.
SDValue TOCOff = DAG.getIntPtrConstant(TOCAnchorOffset, dl);
SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, Callee, TOCOff);
SDValue TOCPtr =
DAG.getLoad(RegVT, dl, LDChain, AddTOC,
MPI.getWithOffset(TOCAnchorOffset), Alignment, MMOFlags);
// One for loading the environment pointer.
SDValue PtrOff = DAG.getIntPtrConstant(EnvPtrOffset, dl);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, RegVT, Callee, PtrOff);
SDValue LoadEnvPtr =
DAG.getLoad(RegVT, dl, LDChain, AddPtr,
MPI.getWithOffset(EnvPtrOffset), Alignment, MMOFlags);
// Then copy the newly loaded TOC anchor to the TOC pointer.
SDValue TOCVal = DAG.getCopyToReg(Chain, dl, TOCReg, TOCPtr, Glue);
Chain = TOCVal.getValue(0);
Glue = TOCVal.getValue(1);
// If the function call has an explicit 'nest' parameter, it takes the
// place of the environment pointer.
assert((!hasNest || !Subtarget.isAIXABI()) &&
"Nest parameter is not supported on AIX.");
if (!hasNest) {
SDValue EnvVal = DAG.getCopyToReg(Chain, dl, EnvPtrReg, LoadEnvPtr, Glue);
Chain = EnvVal.getValue(0);
Glue = EnvVal.getValue(1);
}
// The rest of the indirect call sequence is the same as in the
// non-descriptor case.
prepareIndirectCall(DAG, LoadFuncPtr, Glue, Chain, dl);
}
static void
buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
const SDLoc &dl, bool isTailCall, bool isVarArg,
bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
const PPCSubtarget &Subtarget, bool isIndirect) {
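// Assemble the operands in the order the call node expects: the chain,
// then the callee (for direct calls) or the TOC-restore address plus any
// implicit registers (for indirect calls), tail-call data if any, the
// argument registers, ABI bookkeeping registers, the register mask, and
// finally the glue if present.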
const bool IsPPC64 = Subtarget.isPPC64();
// MVT for a general purpose register.
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
// First operand is always the chain.
Ops.push_back(Chain);
// If it's a direct call pass the callee as the second operand.
if (!isIndirect)
Ops.push_back(Callee);
else {
assert(!isPatchPoint && "Patch point calls are not indirect.");
// For the TOC based ABIs, we have saved the TOC pointer to the linkage area
// on the stack (this would have been done in `LowerCall_64SVR4` or
// `LowerCall_AIX`). The call instruction is a pseudo instruction that
// represents both the indirect branch and a load that restores the TOC
// pointer from the linkage area. The operand for the TOC restore is an add
// of the TOC save offset to the stack pointer. This must be the second
// operand: after the chain input but before any other variadic arguments.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT);
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, StackPtr, TOCOff);
Ops.push_back(AddTOC);
}
// Add the register used for the environment pointer.
if (Subtarget.usesFunctionDescriptors() && !hasNest)
Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(),
RegVT));
// Add CTR register as callee so a bctr can be emitted later.
if (isTailCall)
Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT));
}
// If this is a tail call add stack pointer delta.
if (isTailCall)
Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
// We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
// no way to mark dependencies as implicit here.
// We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) && !isPatchPoint)
Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT));
// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
if (isVarArg && Subtarget.is32BitELFABI())
Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask =
TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
// If the glue is valid, it is the last operand.
if (Glue.getNode())
Ops.push_back(Glue);
}
SDValue PPCTargetLowering::FinishCall(
CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI())
setUsesTOCBasePtr(DAG);
const bool isIndirect = isIndirectCall(Callee, DAG, Subtarget, isPatchPoint);
unsigned CallOpc = getCallOpcode(isIndirect, isPatchPoint, isTailCall,
DAG.getMachineFunction().getFunction(),
Callee, Subtarget, DAG.getTarget());
if (!isIndirect)
Callee = transformCallee(Callee, DAG, dl, Subtarget);
else if (Subtarget.usesFunctionDescriptors())
prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CS,
dl, hasNest, Subtarget);
else
prepareIndirectCall(DAG, Callee, Glue, Chain, dl);
// Build the operand list for the call instruction.
SmallVector<SDValue, 8> Ops;
buildCallOperands(Ops, CallConv, dl, isTailCall, isVarArg, isPatchPoint,
hasNest, DAG, RegsToPass, Glue, Chain, Callee, SPDiff,
Subtarget, isIndirect);
// Emit tail call.
if (isTailCall) {
assert(((Callee.getOpcode() == ISD::Register &&
cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
Callee.getOpcode() == ISD::TargetExternalSymbol ||
Callee.getOpcode() == ISD::TargetGlobalAddress ||
isa<ConstantSDNode>(Callee)) &&
"Expecting a global address, external symbol, absolute value or "
"register");
assert(CallOpc == PPCISD::TC_RETURN &&
"Unexpected call opcode for a tail call.");
DAG.getMachineFunction().getFrameInfo().setHasTailCall();
return DAG.getNode(CallOpc, dl, MVT::Other, Ops);
}
std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};
Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops);
Glue = Chain.getValue(1);
// When performing tail call optimization the callee pops its arguments off
// the stack. Account for this here so these bytes can be pushed back on in
// PPCFrameLowering::eliminateCallFramePseudoInstr.
int BytesCalleePops = (CallConv == CallingConv::Fast &&
getTargetMachine().Options.GuaranteedTailCallOpt)
? NumBytes
: 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
DAG.getIntPtrConstant(BytesCalleePops, dl, true),
Glue, dl);
Glue = Chain.getValue(1);
return LowerCallResult(Chain, Glue, CallConv, isVarArg, Ins, dl, DAG, InVals);
}
SDValue
PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &isTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
bool isPatchPoint = CLI.IsPatchPoint;
ImmutableCallSite CS = CLI.CS;
if (isTailCall) {
if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall()))
isTailCall = false;
else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
isTailCall =
IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
isVarArg, Outs, Ins, DAG);
else
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
Ins, DAG);
if (isTailCall) {
++NumTailCalls;
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
++NumSiblingCalls;
assert(isa<GlobalAddressSDNode>(Callee) &&
"Callee should be an llvm::Function object.");
LLVM_DEBUG(
const GlobalValue *GV =
cast<GlobalAddressSDNode>(Callee)->getGlobal();
const unsigned Width =
80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0");
dbgs() << "TCO caller: "
<< left_justify(DAG.getMachineFunction().getName(), Width)
<< ", callee linkage: " << GV->getVisibility() << ", "
<< GV->getLinkage() << "\n");
}
}
if (!isTailCall && CS && CS.isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// When long calls (i.e. indirect calls) are always used, calls are always
// made via function pointer. If we have a function name, first translate it
// into a pointer.
if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
!isTailCall)
Callee = LowerGlobalAddress(Callee, DAG);
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
isTailCall, isPatchPoint, Outs, OutVals, Ins,
dl, DAG, InVals, CS);
if (Subtarget.isSVR4ABI())
return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
isTailCall, isPatchPoint, Outs, OutVals, Ins,
dl, DAG, InVals, CS);
if (Subtarget.isAIXABI())
return LowerCall_AIX(Chain, Callee, CallConv, isVarArg,
isTailCall, isPatchPoint, Outs, OutVals, Ins,
dl, DAG, InVals, CS);
return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
isTailCall, isPatchPoint, Outs, OutVals, Ins,
dl, DAG, InVals, CS);
}
SDValue PPCTargetLowering::LowerCall_32SVR4(
SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, bool isPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite CS) const {
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
assert((CallConv == CallingConv::C ||
CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) && "Unknown calling convention!");
unsigned PtrByteSize = 4;
MachineFunction &MF = DAG.getMachineFunction();
// Mark this function as potentially containing a tail call. As a
// consequence the frame pointer will be used for dynamic allocation and for
// restoring the caller's stack pointer in this function's epilog. This is
// done because, by tail calling, the called function might overwrite the
// value in this function's (MF) stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CallConv == CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
// Count how many bytes are to be pushed on the stack, including the linkage
// area, parameter list area and the part of the local variable space which
// contains copies of aggregates which are passed by value.
// Assign locations to all of the outgoing arguments.
SmallVector<CCValAssign, 16> ArgLocs;
PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
PtrByteSize);
if (useSoftFloat())
CCInfo.PreAnalyzeCallOperands(Outs);
if (isVarArg) {
// Handle fixed and variable vector arguments differently.
// Fixed vector arguments go into registers as long as registers are
// available. Variable vector arguments always go into memory.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
bool Result;
if (Outs[i].IsFixed) {
Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
CCInfo);
} else {
Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo);
}
if (Result) {
#ifndef NDEBUG
errs() << "Call operand #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << "\n";
#endif
llvm_unreachable(nullptr);
}
}
} else {
// All arguments are treated the same.
CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
}
CCInfo.clearWasPPCF128();
// Assign locations to all of the outgoing aggregate by value arguments.
SmallVector<CCValAssign, 16> ByValArgLocs;
CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
// Size of the linkage area, parameter list area, and the part of the local
// variable space where copies of aggregates which are passed by value are
// stored.
unsigned NumBytes = CCByValInfo.getNextStackOffset();
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so they can be moved somewhere else
// later.
SDValue LROp, FPOp;
Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
SmallVector<SDValue, 8> MemOpChains;
bool seenFloatArg = false;
// Walk the register/memloc assignments, inserting copies/loads.
// i - Tracks the index into the list of registers allocated for the call
// RealArgIdx - Tracks the index into the list of actual function arguments
// j - Tracks the index into the list of byval arguments
for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
i != e;
++i, ++RealArgIdx) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[RealArgIdx];
ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;
if (Flags.isByVal()) {
// Argument is an aggregate which is passed by value, thus we need to
// create a copy of it in the local variable space of the current stack
// frame (which is the stack frame of the caller) and pass the address of
// this copy to the callee.
assert((j < ByValArgLocs.size()) && "Index out of bounds!");
CCValAssign &ByValVA = ByValArgLocs[j++];
assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
// Memory reserved in the local variable space of the caller's stack frame.
unsigned LocMemOffset = ByValVA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
StackPtr, PtrOff);
// Create a copy of the argument in the local area of the current
// stack frame.
SDValue MemcpyCall =
CreateCopyOfByValArgument(Arg, PtrOff,
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
// This must go outside the CALLSEQ_START..END.
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
Chain = CallSeqStart = NewCallSeqStart;
// Pass the address of the aggregate copy on the stack either in a
// physical register or in the parameter list area of the current stack
// frame to the callee.
Arg = PtrOff;
}
// When useCRBits() is true, there can be i1 arguments.
// This is because getRegisterType(MVT::i1) => MVT::i1,
// while for other integer types getRegisterType() => MVT::i32.
// Extend i1 here to ensure the callee will get an i32.
if (Arg.getValueType() == MVT::i1)
Arg = DAG.getNode(Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
dl, MVT::i32, Arg);
if (VA.isRegLoc()) {
seenFloatArg |= VA.getLocVT().isFloatingPoint();
// Put argument in a physical register.
if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
bool IsLE = Subtarget.isLittleEndian();
SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(IsLE ? 0 : 1, dl));
RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0)));
SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(IsLE ? 1 : 0, dl));
RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(),
SVal.getValue(0)));
} else
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
// Put argument in the parameter list area of the current stack frame.
assert(VA.isMemLoc());
unsigned LocMemOffset = VA.getLocMemOffset();
if (!isTailCall) {
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
StackPtr, PtrOff);
MemOpChains.push_back(
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
} else {
// Calculate and remember argument location.
CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
TailCallArguments);
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
// Set CR bit 6 to true if this is a vararg call with floating args passed in
// registers.
if (isVarArg) {
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, InFlag };
Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));
InFlag = Chain.getValue(1);
}
if (isTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
/* unused except on PPC64 ELFv1 */ false, DAG,
RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
NumBytes, Ins, InVals, CS);
}
// Copy an argument into memory, being careful to do this outside the
// call sequence for the call to which the argument belongs.
SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) const {
SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
// The MEMCPY must go outside the CALLSEQ_START..END.
int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
return NewCallSeqStart;
}
SDValue PPCTargetLowering::LowerCall_64SVR4(
SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, bool isPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite CS) const {
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
bool hasNest = false;
bool IsSibCall = false;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
unsigned PtrByteSize = 8;
MachineFunction &MF = DAG.getMachineFunction();
if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
IsSibCall = true;
// Mark this function as potentially containing a tail call. As a
// consequence the frame pointer will be used for dynamic allocation and for
// restoring the caller's stack pointer in this function's epilog. This is
// done because, by tail calling, the called function might overwrite the
// value in this function's (MF) stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CallConv == CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
assert(!(CallConv == CallingConv::Fast && isVarArg) &&
"fastcc not supported on varargs functions");
// Count how many bytes are to be pushed on the stack, including the linkage
// area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
// reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
// area is 32 bytes reserved space for [SP][CR][LR][TOC].
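// Illustrative ELFv1 linkage area layout (byte offsets from the SP):
//   +0 back chain, +8 CR save, +16 LR save, +24/+32 reserved, +40 TOC save.
// ELFv2 drops the two reserved doublewords:
//   +0 back chain, +8 CR save, +16 LR save, +24 TOC save.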
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
unsigned NumBytes = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
unsigned &QFPR_idx = FPR_idx;
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned NumGPRs = array_lengthof(GPR);
const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
const unsigned NumVRs = array_lengthof(VR);
const unsigned NumQFPRs = NumFPRs;
// On ELFv2, we can avoid allocating the parameter area if all the arguments
// can be passed to the callee in registers.
// For the fast calling convention, there is another check below.
// Note: we should keep this consistent with LowerFormalArguments_64SVR4().
bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast;
if (!HasParameterArea) {
unsigned ParamAreaSize = NumGPRs * PtrByteSize;
unsigned AvailableFPRs = NumFPRs;
unsigned AvailableVRs = NumVRs;
unsigned NumBytesTmp = NumBytes;
for (unsigned i = 0; i != NumOps; ++i) {
if (Outs[i].Flags.isNest()) continue;
if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytesTmp, AvailableFPRs, AvailableVRs,
Subtarget.hasQPX()))
HasParameterArea = true;
}
}
// When using the fast calling convention, we don't provide backing for
// arguments that will be in registers.
unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
// Avoid allocating parameter area for fastcc functions if all the arguments
// can be passed in the registers.
if (CallConv == CallingConv::Fast)
HasParameterArea = false;
// Add up all the space actually used.
for (unsigned i = 0; i != NumOps; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
EVT OrigVT = Outs[i].ArgVT;
if (Flags.isNest())
continue;
if (CallConv == CallingConv::Fast) {
if (Flags.isByVal()) {
NumGPRsUsed += (Flags.getByValSize()+7)/8;
if (NumGPRsUsed > NumGPRs)
HasParameterArea = true;
} else {
switch (ArgVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (++NumGPRsUsed <= NumGPRs)
continue;
break;
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
if (++NumVRsUsed <= NumVRs)
continue;
break;
case MVT::v4f32:
// When using QPX, this is handled like a FP register, otherwise, it
// is an Altivec register.
if (Subtarget.hasQPX()) {
if (++NumFPRsUsed <= NumFPRs)
continue;
} else {
if (++NumVRsUsed <= NumVRs)
continue;
}
break;
case MVT::f32:
case MVT::f64:
case MVT::v4f64: // QPX
case MVT::v4i1: // QPX
if (++NumFPRsUsed <= NumFPRs)
continue;
break;
}
HasParameterArea = true;
}
}
/* Respect alignment of argument on the stack. */
unsigned Align =
CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
NumBytes = ((NumBytes + Align - 1) / Align) * Align;
NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
if (Flags.isInConsecutiveRegsLast())
NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
}
unsigned NumBytesActuallyUsed = NumBytes;
// In the old ELFv1 ABI,
// the prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if it is varargs.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
// In the ELFv2 ABI, we allocate the parameter area iff a callee
// really requires memory operands, e.g. a vararg function.
if (HasParameterArea)
NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
else
NumBytes = LinkageSize;
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CallConv == CallingConv::Fast)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
int SPDiff = 0;
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
if (!IsSibCall)
SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
if (isTailCall)
Chain = DAG.getStackArgumentTokenFactor(Chain);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so they can be moved somewhere else
// later.
SDValue LROp, FPOp;
Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
// Figure out which arguments are going to go in registers, and which in
// memory. Also, if this is a vararg function, floating-point arguments must
// be stored to our stack, and loaded into integer regs as well, if any
// integer regs are available for argument passing.
unsigned ArgOffset = LinkageSize;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
SmallVector<SDValue, 8> MemOpChains;
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
EVT OrigVT = Outs[i].ArgVT;
// PtrOff will be used to store the current argument to the stack if a
// register cannot be found for it.
SDValue PtrOff;
// We re-align the argument offset for each argument, except when using the
// fast calling convention, when we need to make sure we do that only when
// we'll actually use a stack slot.
auto ComputePtrOff = [&]() {
/* Respect alignment of argument on the stack. */
unsigned Align =
CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
};
if (CallConv != CallingConv::Fast) {
ComputePtrOff();
/* Compute GPR index associated with argument offset. */
GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx = std::min(GPR_idx, NumGPRs);
}
// Promote integers to 64-bit values.
if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
// FIXME: Should this use ANY_EXTEND if neither sext nor zext?
unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
}
// FIXME memcpy is used way more than necessary. Correctness first.
// Note: "by value" is code for passing a structure by value, not
// basic types.
if (Flags.isByVal()) {
// Note: Size includes alignment padding, so
// struct x { short a; char b; }
// will have Size = 4. With #pragma pack(1), it will have Size = 3.
// These are the proper values we need for right-justifying the
// aggregate in a parameter register.
unsigned Size = Flags.getByValSize();
// An empty aggregate parameter takes up no storage and no
// registers.
if (Size == 0)
continue;
if (CallConv == CallingConv::Fast)
ComputePtrOff();
// All aggregates smaller than 8 bytes must be passed right-justified.
if (Size==1 || Size==2 || Size==4) {
EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
MachinePointerInfo(), VT);
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
continue;
}
}
if (GPR_idx == NumGPRs && Size < 8) {
SDValue AddPtr = PtrOff;
if (!isLittleEndian) {
SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
PtrOff.getValueType());
AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
}
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
CallSeqStart,
Flags, DAG, dl);
ArgOffset += PtrByteSize;
continue;
}
// Copy entire object into memory. There are cases where gcc-generated
// code assumes it is there, even if it could be put entirely into
// registers. (This is not what the doc says.)
// FIXME: The above statement is likely due to a misunderstanding of the
// documents. All arguments must be copied into the parameter area BY
// THE CALLEE in the event that the callee takes the address of any
// formal argument. That has not yet been implemented. However, it is
// reasonable to use the stack area as a staging area for the register
// load.
// Skip this for small aggregates, as we will use the same slot for a
// right-justified copy, below.
if (Size >= 8)
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
CallSeqStart,
Flags, DAG, dl);
// When a register is available, pass a small aggregate right-justified.
if (Size < 8 && GPR_idx != NumGPRs) {
// The easiest way to get this right-justified in a register
// is to copy the structure into the rightmost portion of a
// local variable slot, then load the whole slot into the
// register.
// FIXME: The memcpy seems to produce pretty awful code for
// small aggregates, particularly for packed ones.
// FIXME: It would be preferable to use the slot in the
// parameter save area instead of a new local variable.
SDValue AddPtr = PtrOff;
if (!isLittleEndian) {
SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType());
AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
}
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
CallSeqStart,
Flags, DAG, dl);
// Load the slot into the register.
SDValue Load =
DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
// Done with this argument.
ArgOffset += PtrByteSize;
continue;
}
// For aggregates larger than PtrByteSize, copy the pieces of the
// object that fit into registers from the parameter save area.
for (unsigned j=0; j<Size; j+=PtrByteSize) {
SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
if (GPR_idx != NumGPRs) {
SDValue Load =
DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
} else {
ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
break;
}
}
continue;
}
switch (Arg.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (Flags.isNest()) {
// The 'nest' parameter, if any, is passed in R11.
RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
hasNest = true;
break;
}
// These can be scalar arguments or elements of an integer array type
// passed directly. Clang may use those instead of "byval" aggregate
// types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != NumGPRs) {
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
if (CallConv == CallingConv::Fast)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, false, MemOpChains,
TailCallArguments, dl);
if (CallConv == CallingConv::Fast)
ArgOffset += PtrByteSize;
}
if (CallConv != CallingConv::Fast)
ArgOffset += PtrByteSize;
break;
case MVT::f32:
case MVT::f64: {
// These can be scalar arguments or elements of a float array type
// passed directly. The latter are used to implement ELFv2 homogenous
// float aggregates.
// Named arguments go into FPRs first, and once they overflow, the
// remaining arguments go into GPRs and then the parameter save area.
// Unnamed arguments for vararg functions always go to GPRs and
// then the parameter save area. For now, put all arguments to vararg
// routines always in both locations (FPR *and* GPR or stack slot).
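// For example, a homogeneous float aggregate {float a, b, c} that must be
// shadowed in GPRs packs a and b into a single doubleword register and
// places c alone in the next one, with the word order depending on
// endianness (see the BUILD_PAIR/SHL handling below).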
bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
bool NeededLoad = false;
// First load the argument into the next available FPR.
if (FPR_idx != NumFPRs)
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
// Next, load the argument into GPR or stack slot if needed.
if (!NeedGPROrStack)
;
else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) {
// FIXME: We may want to re-enable this for CallingConv::Fast on the P8
// once we support fp <-> gpr moves.
// In the non-vararg case, this can only ever happen in the
// presence of f32 array types, since otherwise we never run
// out of FPRs before running out of GPRs.
SDValue ArgVal;
// Double values are always passed in a single GPR.
if (Arg.getValueType() != MVT::f32) {
ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
// Non-array float values are extended and passed in a GPR.
} else if (!Flags.isInConsecutiveRegs()) {
ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
// If we have an array of floats, we collect every odd element
// together with its predecessor into one GPR.
} else if (ArgOffset % PtrByteSize != 0) {
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
if (!isLittleEndian)
std::swap(Lo, Hi);
ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
// The final element, if even, goes into the first half of a GPR.
} else if (Flags.isInConsecutiveRegsLast()) {
ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
if (!isLittleEndian)
ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
DAG.getConstant(32, dl, MVT::i32));
// Non-final even elements are skipped; they will be handled
// together with the subsequent argument on the next go-around.
} else
ArgVal = SDValue();
if (ArgVal.getNode())
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
} else {
if (CallConv == CallingConv::Fast)
ComputePtrOff();
// Single-precision floating-point values are mapped to the
// second (rightmost) word of the stack doubleword.
if (Arg.getValueType() == MVT::f32 &&
!isLittleEndian && !Flags.isInConsecutiveRegs()) {
SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
}
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, false, MemOpChains,
TailCallArguments, dl);
NeededLoad = true;
}
// When passing an array of floats, the array occupies consecutive
// space in the argument area; only round up to the next doubleword
// at the end of the array. Otherwise, each float takes 8 bytes.
if (CallConv != CallingConv::Fast || NeededLoad) {
ArgOffset += (Arg.getValueType() == MVT::f32 &&
Flags.isInConsecutiveRegs()) ? 4 : 8;
if (Flags.isInConsecutiveRegsLast())
ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
}
break;
}
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
if (!Subtarget.hasQPX()) {
// These can be scalar arguments or elements of a vector array type
// passed directly. The latter are used to implement ELFv2 homogenous
// vector aggregates.
// For a varargs call, named arguments go into VRs or on the stack as
// usual; unnamed arguments always go to the stack or the corresponding
// GPRs when within range. For now, we always put the value in both
// locations (or even all three).
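// ("All three" meaning the stack slot, a VR if one is available, and the
// GPRs that shadow those stack bytes, as the code below does.)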
if (isVarArg) {
assert(HasParameterArea &&
"Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
if (VR_idx != NumVRs) {
SDValue Load =
DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
}
ArgOffset += 16;
for (unsigned i=0; i<16; i+=PtrByteSize) {
if (GPR_idx == NumGPRs)
break;
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
break;
}
// Non-varargs Altivec params go into VRs or on the stack.
if (VR_idx != NumVRs) {
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
} else {
if (CallConv == CallingConv::Fast)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, true, MemOpChains,
TailCallArguments, dl);
if (CallConv == CallingConv::Fast)
ArgOffset += 16;
}
if (CallConv != CallingConv::Fast)
ArgOffset += 16;
break;
} // not QPX
assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
"Invalid QPX parameter type");
LLVM_FALLTHROUGH;
case MVT::v4f64:
case MVT::v4i1: {
bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
if (isVarArg) {
assert(HasParameterArea &&
"Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
if (QFPR_idx != NumQFPRs) {
SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store,
PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
}
ArgOffset += (IsF32 ? 16 : 32);
for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) {
if (GPR_idx == NumGPRs)
break;
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
break;
}
// Non-varargs QPX params go into registers or on the stack.
if (QFPR_idx != NumQFPRs) {
RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
} else {
if (CallConv == CallingConv::Fast)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, true, MemOpChains,
TailCallArguments, dl);
if (CallConv == CallingConv::Fast)
ArgOffset += (IsF32 ? 16 : 32);
}
if (CallConv != CallingConv::Fast)
ArgOffset += (IsF32 ? 16 : 32);
break;
}
}
}
assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
"mismatch in size of parameter area");
(void)NumBytesActuallyUsed;
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// Check if this is an indirect call (MTCTR/BCTRL).
// See prepareDescriptorIndirectCall and buildCallOperands for more
// information about calls through function pointers in the 64-bit SVR4 ABI.
if (!isTailCall && !isPatchPoint &&
!isFunctionGlobalAddress(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) {
// Load r2 into a virtual register and store it to the TOC save area.
setUsesTOCBasePtr(DAG);
SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
// TOC save area offset.
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
Chain = DAG.getStore(
Val.getValue(1), dl, Val, AddPtr,
MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
// In the ELFv2 ABI, R12 must contain the address of an indirect callee.
// This does not mean the MTCTR instruction must use R12; it's easier
// to model this as an extra parameter, so do that.
if (isELFv2ABI && !isPatchPoint)
RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
if (isTailCall && !IsSibCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
SPDiff, NumBytes, Ins, InVals, CS);
}
SDValue PPCTargetLowering::LowerCall_Darwin(
SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, bool isPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite CS) const {
unsigned NumOps = Outs.size();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
unsigned PtrByteSize = isPPC64 ? 8 : 4;
MachineFunction &MF = DAG.getMachineFunction();
// Mark this function as potentially containing a tail call. As a
// consequence the frame pointer will be used for dynamic allocation and for
// restoring the caller's stack pointer in this function's epilog. This is
// done because, by tail calling, the called function might overwrite the
// value in this function's (MF) stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CallConv == CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
// Count how many bytes are to be pushed on the stack, including the linkage
// area, and parameter passing area. We start with 24/48 bytes, which is
// prereserved space for [SP][CR][LR][3 x unused].
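// That is 6 pointer-sized slots: 6 * 4 = 24 bytes on 32-bit targets and
// 6 * 8 = 48 bytes on 64-bit targets.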
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
unsigned NumBytes = LinkageSize;
// Add up all the space actually used.
// In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
// they all go in registers, but we must reserve stack space for them for
// possible use by the caller. In varargs or 64-bit calls, parameters are
// assigned stack space in order, with padding so Altivec parameters are
// 16-byte aligned.
unsigned nAltivecParamsAtEnd = 0;
for (unsigned i = 0; i != NumOps; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
// Varargs Altivec parameters are padded to a 16 byte boundary.
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
if (!isVarArg && !isPPC64) {
// Non-varargs Altivec parameters go after all the non-Altivec
// parameters; handle those later so we know how much padding we need.
nAltivecParamsAtEnd++;
continue;
}
// Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
NumBytes = ((NumBytes+15)/16)*16;
}
NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
}
// Allow for Altivec parameters at the end, if needed.
if (nAltivecParamsAtEnd) {
NumBytes = ((NumBytes+15)/16)*16;
NumBytes += 16*nAltivecParamsAtEnd;
}
// The prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if it is varargs.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CallConv == CallingConv::Fast)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
if (isTailCall)
Chain = DAG.getStackArgumentTokenFactor(Chain);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so they can be moved somewhere else
// later.
SDValue LROp, FPOp;
Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
SDValue StackPtr;
if (isPPC64)
StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
else
StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
// Figure out which arguments are going to go in registers, and which in
// memory. Also, if this is a vararg function, floating-point arguments must
// be stored to our stack, and loaded into integer regs as well, if any
// integer regs are available for argument passing.
unsigned ArgOffset = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
static const MCPhysReg GPR_32[] = { // 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
};
static const MCPhysReg GPR_64[] = { // 64-bit registers.
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned NumGPRs = array_lengthof(GPR_32);
const unsigned NumFPRs = 13;
const unsigned NumVRs = array_lengthof(VR);
const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
SmallVector<SDValue, 8> MemOpChains;
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// PtrOff will be used to store the current argument to the stack if a
// register cannot be found for it.
SDValue PtrOff;
PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
// On PPC64, promote integers to 64-bit values.
if (isPPC64 && Arg.getValueType() == MVT::i32) {
// FIXME: Should this use ANY_EXTEND if neither sext nor zext?
unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
}
// FIXME memcpy is used way more than necessary. Correctness first.
// Note: "by value" is code for passing a structure by value, not
// basic types.
if (Flags.isByVal()) {
unsigned Size = Flags.getByValSize();
// Very small objects are passed right-justified. Everything else is
// passed left-justified.
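// For example, a 2-byte struct that spills is stored at
// PtrOff + (PtrByteSize - 2), i.e. in the rightmost bytes of its
// pointer-sized slot.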
if (Size==1 || Size==2) {
EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
MachinePointerInfo(), VT);
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
} else {
SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
PtrOff.getValueType());
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
CallSeqStart,
Flags, DAG, dl);
ArgOffset += PtrByteSize;
}
continue;
}
// Copy entire object into memory. There are cases where gcc-generated
// code assumes it is there, even if it could be put entirely into
// registers. (This is not what the doc says.)
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
CallSeqStart,
Flags, DAG, dl);
// For small aggregates (Darwin only) and aggregates >= PtrByteSize,
// copy the pieces of the object that fit into registers from the
// parameter save area.
for (unsigned j=0; j<Size; j+=PtrByteSize) {
SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
if (GPR_idx != NumGPRs) {
SDValue Load =
DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
} else {
ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
break;
}
}
continue;
}
switch (Arg.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (GPR_idx != NumGPRs) {
if (Arg.getValueType() == MVT::i1)
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, isTailCall, false, MemOpChains,
TailCallArguments, dl);
}
ArgOffset += PtrByteSize;
break;
case MVT::f32:
case MVT::f64:
if (FPR_idx != NumFPRs) {
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
if (isVarArg) {
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
// Float varargs are always shadowed in available integer registers
if (GPR_idx != NumGPRs) {
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
} else {
// If we have any FPRs remaining, we may also have GPRs remaining.
// Args passed in FPRs consume either 1 (f32) or 2 (f64) available
// GPRs.
if (GPR_idx != NumGPRs)
++GPR_idx;
if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
!isPPC64) // PPC64 has 64-bit GPR's obviously :)
++GPR_idx;
}
} else
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, isTailCall, false, MemOpChains,
TailCallArguments, dl);
if (isPPC64)
ArgOffset += 8;
else
ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
break;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
if (isVarArg) {
// These go aligned on the stack, or in the corresponding R registers
// when within range. The Darwin PPC ABI doc claims they also go in
// V registers; in fact gcc does this only for arguments that are
// prototyped, not for those that match the ellipsis. We do it for all
// arguments, which seems to work.
while (ArgOffset % 16 !=0) {
ArgOffset += PtrByteSize;
if (GPR_idx != NumGPRs)
GPR_idx++;
}
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
DAG.getConstant(ArgOffset, dl, PtrVT));
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
if (VR_idx != NumVRs) {
SDValue Load =
DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
}
ArgOffset += 16;
for (unsigned i=0; i<16; i+=PtrByteSize) {
if (GPR_idx == NumGPRs)
break;
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
break;
}
// Non-varargs Altivec params generally go in registers, but have
// stack space allocated at the end.
if (VR_idx != NumVRs) {
// Doesn't have GPR space allocated.
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
} else if (nAltivecParamsAtEnd==0) {
// We are emitting Altivec params in order.
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, isTailCall, true, MemOpChains,
TailCallArguments, dl);
ArgOffset += 16;
}
break;
}
}
// If all Altivec parameters fit in registers, as they usually do,
// they get stack space following the non-Altivec parameters. We
// don't track this here because nobody below needs it.
// If there are more Altivec parameters than fit in registers emit
// the stores here.
if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
unsigned j = 0;
// Offset is aligned; skip the first 12 params, which go in V registers.
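// (The 12 V registers V2-V13 each get a 16-byte slot: 12 * 16 == 192 bytes.)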
ArgOffset = ((ArgOffset+15)/16)*16;
ArgOffset += 12*16;
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
EVT ArgType = Outs[i].VT;
if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
if (++j > NumVRs) {
SDValue PtrOff;
// We are emitting Altivec params in order.
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, isTailCall, true, MemOpChains,
TailCallArguments, dl);
ArgOffset += 16;
}
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// On Darwin, R12 must contain the address of an indirect callee. This does
// not mean the MTCTR instruction must use R12; it's easier to model this as
// an extra parameter, so do that.
if (!isTailCall &&
!isFunctionGlobalAddress(Callee) &&
!isa<ExternalSymbolSDNode>(Callee) &&
!isBLACompatibleAddress(Callee, DAG))
RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
PPC::R12), Callee));
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
if (isTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
/* unused except on PPC64 ELFv1 */ false, DAG,
RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
NumBytes, Ins, InVals, CS);
}
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
CCState &State) {
if (ValVT == MVT::f128)
report_fatal_error("f128 is unimplemented on AIX.");
if (ArgFlags.isByVal())
report_fatal_error("Passing structure by value is unimplemented.");
if (ArgFlags.isNest())
report_fatal_error("Nest arguments are unimplemented.");
if (ValVT.isVector() || LocVT.isVector())
report_fatal_error("Vector arguments are unimplemented on AIX.");
const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
State.getMachineFunction().getSubtarget());
const bool IsPPC64 = Subtarget.isPPC64();
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
static const MCPhysReg GPR_32[] = {// 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10};
static const MCPhysReg GPR_64[] = {// 64-bit registers.
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10};
// Arguments always reserve space in the parameter save area.
switch (ValVT.SimpleTy) {
default:
report_fatal_error("Unhandled value type for argument.");
case MVT::i64:
// i64 arguments should have been split to i32 for PPC32.
assert(IsPPC64 && "PPC32 should have split i64 values.");
LLVM_FALLTHROUGH;
case MVT::i1:
case MVT::i32:
State.AllocateStack(PtrByteSize, PtrByteSize);
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
// Promote integers if needed.
if (ValVT.getSizeInBits() < RegVT.getSizeInBits())
LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
: CCValAssign::LocInfo::ZExt;
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
}
else
report_fatal_error("Handling of placing parameters on the stack is "
"unimplemented!");
return false;
case MVT::f32:
case MVT::f64: {
// Parameter save area (PSA) is reserved even if the float is passed in an FPR.
const unsigned StoreSize = LocVT.getStoreSize();
// Floats are always 4-byte aligned in the PSA on AIX.
// This includes f64 in 64-bit mode for ABI compatibility.
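// For example, an f32 in 64-bit mode still reserves a full 8-byte slot here.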
State.AllocateStack(IsPPC64 ? 8 : StoreSize, 4);
if (unsigned Reg = State.AllocateReg(FPR))
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
else
report_fatal_error("Handling of placing parameters on the stack is "
"unimplemented!");
// AIX requires that GPRs are reserved for float arguments.
// Successfully reserved GPRs are only initialized for vararg calls.
MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
for (unsigned I = 0; I < StoreSize; I += PtrByteSize) {
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
if (State.isVarArg()) {
// Custom handling is required for:
// f64 in PPC32 needs to be split into 2 GPRs.
// f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR.
State.addLoc(
CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
}
} else if (State.isVarArg()) {
report_fatal_error("Handling of placing parameters on the stack is "
"unimplemented!");
}
}
return false;
}
}
return true;
}
static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
bool IsPPC64) {
assert((IsPPC64 || SVT != MVT::i64) &&
"i64 should have been split for 32-bit codegen.");
switch (SVT) {
default:
report_fatal_error("Unexpected value type for formal argument");
case MVT::i1:
case MVT::i32:
case MVT::i64:
return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
case MVT::f32:
return &PPC::F4RCRegClass;
case MVT::f64:
return &PPC::F8RCRegClass;
}
}
static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
SelectionDAG &DAG, SDValue ArgValue,
MVT LocVT, const SDLoc &dl) {
assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());
assert(ValVT.getSizeInBits() < LocVT.getSizeInBits());
if (Flags.isSExt())
ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue,
DAG.getValueType(ValVT));
else if (Flags.isZExt())
ArgValue = DAG.getNode(ISD::AssertZext, dl, LocVT, ArgValue,
DAG.getValueType(ValVT));
return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue);
}
SDValue PPCTargetLowering::LowerFormalArguments_AIX(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) &&
"Unexpected calling convention!");
if (isVarArg)
report_fatal_error("This call type is unimplemented on AIX.");
if (getTargetMachine().Options.GuaranteedTailCallOpt)
report_fatal_error("Tail call support is unimplemented on AIX.");
if (useSoftFloat())
report_fatal_error("Soft float support is unimplemented on AIX.");
const PPCSubtarget &Subtarget =
static_cast<const PPCSubtarget &>(DAG.getSubtarget());
if (Subtarget.hasQPX())
report_fatal_error("QPX support is not supported on AIX.");
const bool IsPPC64 = Subtarget.isPPC64();
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
MachineFunction &MF = DAG.getMachineFunction();
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
// On AIX a minimum of 8 words is saved to the parameter save area.
const unsigned MinParameterSaveArea = 8 * PtrByteSize;
CCInfo.AllocateStack(LinkageSize + MinParameterSaveArea, PtrByteSize);
CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue;
ISD::ArgFlagsTy Flags = Ins[i].Flags;
if (VA.isRegLoc()) {
EVT ValVT = VA.getValVT();
MVT LocVT = VA.getLocVT();
MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
unsigned VReg =
MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
if (ValVT.isScalarInteger() &&
(ValVT.getSizeInBits() < LocVT.getSizeInBits())) {
ArgValue =
truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
}
InVals.push_back(ArgValue);
} else {
report_fatal_error("Handling of formal arguments on the stack is "
"unimplemented!");
}
}
// Area that is at least reserved in the caller of this function.
unsigned MinReservedArea = CCInfo.getNextStackOffset();
// Set the size that is at least reserved in caller of this function. Tail
// call optimized function's reserved stack space needs to be aligned so
// that taking the difference between two stack areas will result in an
// aligned stack.
MinReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setMinReservedArea(MinReservedArea);
return Chain;
}
SDValue PPCTargetLowering::LowerCall_AIX(
SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, bool isPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite CS) const {
assert((CallConv == CallingConv::C ||
CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) && "Unexpected calling convention!");
if (isPatchPoint)
report_fatal_error("This call type is unimplemented on AIX.");
const PPCSubtarget& Subtarget =
static_cast<const PPCSubtarget&>(DAG.getSubtarget());
if (Subtarget.hasQPX())
report_fatal_error("QPX is not supported on AIX.");
if (Subtarget.hasAltivec())
report_fatal_error("Altivec support is unimplemented on AIX.");
MachineFunction &MF = DAG.getMachineFunction();
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Reserve space for the linkage save area (LSA) on the stack.
// In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
// [SP][CR][LR][2 x reserved][TOC].
// The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
const bool IsPPC64 = Subtarget.isPPC64();
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
CCInfo.AllocateStack(LinkageSize, PtrByteSize);
CCInfo.AnalyzeCallOperands(Outs, CC_AIX);
// The prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if the callee
// is variadic.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
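// Given the LSA sizes above, this is 24 + 8*4 == 56 bytes on PPC32 and
// 48 + 8*8 == 112 bytes on PPC64.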
const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
const unsigned NumBytes = LinkageSize + MinParameterSaveAreaSize;
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass.
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
CCValAssign &VA = ArgLocs[I++];
if (VA.isMemLoc())
report_fatal_error("Handling of placing parameters on the stack is "
"unimplemented!");
if (!VA.isRegLoc())
report_fatal_error(
"Unexpected non-register location for function call argument.");
SDValue Arg = OutVals[VA.getValNo()];
if (!VA.needsCustom()) {
switch (VA.getLocInfo()) {
default:
report_fatal_error("Unexpected argument extension type.");
case CCValAssign::Full:
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
break;
}
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
continue;
}
// Custom handling is used for GPR initializations for vararg float
// arguments.
assert(isVarArg && VA.getValVT().isFloatingPoint() &&
VA.getLocVT().isInteger() &&
"Unexpected custom register handling for calling convention.");
SDValue ArgAsInt =
DAG.getBitcast(MVT::getIntegerVT(VA.getValVT().getSizeInBits()), Arg);
if (Arg.getValueType().getStoreSize() == VA.getLocVT().getStoreSize())
// f32 in 32-bit GPR
// f64 in 64-bit GPR
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt));
else if (Arg.getValueType().getSizeInBits() < VA.getLocVT().getSizeInBits())
// f32 in 64-bit GPR.
RegsToPass.push_back(std::make_pair(
VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, VA.getLocVT())));
else {
// f64 in two 32-bit GPRs
// The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
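// For example, for the f64 value 1.0 (bit pattern 0x3FF0000000000000),
// GPR1 receives 0x3FF00000 and GPR2 receives 0x00000000.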
assert(Arg.getValueType() == MVT::f64 && isVarArg && !IsPPC64 &&
"Unexpected custom register for argument!");
CCValAssign &GPR1 = VA;
SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt,
DAG.getConstant(32, dl, MVT::i8));
RegsToPass.push_back(std::make_pair(
GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32)));
assert(I != E && "A second custom GPR is expected!");
CCValAssign &GPR2 = ArgLocs[I++];
assert(GPR2.isRegLoc() && GPR2.getValNo() == GPR1.getValNo() &&
GPR2.needsCustom() && "A second custom GPR is expected!");
RegsToPass.push_back(std::make_pair(
GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
}
}
// For indirect calls, we need to save the TOC base to the stack for
// restoration after the call.
if (!isTailCall && !isPatchPoint &&
!isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee)) {
const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
const unsigned TOCSaveOffset =
Subtarget.getFrameLowering()->getTOCSaveOffset();
setUsesTOCBasePtr(DAG);
SDValue Val = DAG.getCopyFromReg(Chain, dl, TOCBaseReg, PtrVT);
SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
SDValue StackPtr = DAG.getRegister(StackPtrReg, PtrVT);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
Chain = DAG.getStore(
Val.getValue(1), dl, Val, AddPtr,
MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (auto Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InFlag);
InFlag = Chain.getValue(1);
}
const int SPDiff = 0;
return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
/* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass,
InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins,
InVals, CS);
}
bool
PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(
Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
? RetCC_PPC_Cold
: RetCC_PPC);
}
SDValue
PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeReturn(Outs,
(Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
? RetCC_PPC_Cold
: RetCC_PPC);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[RealResIdx];
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
break;
}
if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
bool isLittleEndian = Subtarget.isLittleEndian();
// Legalize ret f64 -> ret 2 x i32.
SDValue SVal =
DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl));
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl));
Flag = Chain.getValue(1);
VA = RVLocs[++i]; // skip ahead to next loc
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
} else
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (PPC::G8RCRegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else if (PPC::F8RCRegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
else if (PPC::CRRCRegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i1));
else if (PPC::VRRCRegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::Other));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
}
SDValue
PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
// Get the correct type for integers.
EVT IntVT = Op.getValueType();
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue FPSIdx = getFramePointerFrameIndex(DAG);
// Build a DYNAREAOFFSET node.
SDValue Ops[2] = {Chain, FPSIdx};
SDVTList VTs = DAG.getVTList(IntVT);
return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
}
SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
SelectionDAG &DAG) const {
// When we pop the dynamic allocation we need to restore the SP link.
SDLoc dl(Op);
// Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Construct the stack pointer operand.
bool isPPC64 = Subtarget.isPPC64();
unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
SDValue StackPtr = DAG.getRegister(SP, PtrVT);
// Get the operands for the STACKRESTORE.
SDValue Chain = Op.getOperand(0);
SDValue SaveSP = Op.getOperand(1);
// Load the old link SP.
SDValue LoadLinkSP =
DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());
// Restore the stack pointer.
Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
// Store the old link SP.
return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
}
SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Get the current return address save index.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
int RASI = FI->getReturnAddrSaveIndex();
// If the return address save index hasn't been defined yet.
if (!RASI) {
// Find out the fixed offset of the return address save area.
int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
// Allocate the frame index for the return address save area.
RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
// Save the result.
FI->setReturnAddrSaveIndex(RASI);
}
return DAG.getFrameIndex(RASI, PtrVT);
}
SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Get current frame pointer save index. The users of this index will be
// primarily DYNALLOC instructions.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
int FPSI = FI->getFramePointerSaveIndex();
// If the frame pointer save index hasn't been defined yet.
if (!FPSI) {
// Find out the fixed offset of the frame pointer save area.
int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
// Allocate the frame index for frame pointer save area.
FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
// Save the result.
FI->setFramePointerSaveIndex(FPSI);
}
return DAG.getFrameIndex(FPSI, PtrVT);
}
SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
SDLoc dl(Op);
// Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Negate the size.
SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
DAG.getConstant(0, dl, PtrVT), Size);
// Construct a node for the frame pointer save index.
SDValue FPSIdx = getFramePointerFrameIndex(DAG);
// Build a DYNALLOC node.
SDValue Ops[3] = { Chain, NegSize, FPSIdx };
SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
}
SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, 0, false);
return DAG.getFrameIndex(FI, PtrVT);
}
SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
}
SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
}
SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVectorLoad(Op, DAG);
assert(Op.getValueType() == MVT::i1 &&
"Custom lowering only for i1 loads");
// First, load 8 bits into 32 bits, then truncate to 1 bit.
SDLoc dl(Op);
LoadSDNode *LD = cast<LoadSDNode>(Op);
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
MachineMemOperand *MMO = LD->getMemOperand();
SDValue NewLD =
DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain,
BasePtr, MVT::i8, MMO);
SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);
SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
return DAG.getMergeValues(Ops, dl);
}
SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (Op.getOperand(1).getValueType().isVector())
return LowerVectorStore(Op, DAG);
assert(Op.getOperand(1).getValueType() == MVT::i1 &&
"Custom lowering only for i1 stores");
// First, zero extend to 32 bits, then use a truncating store to 8 bits.
SDLoc dl(Op);
StoreSDNode *ST = cast<StoreSDNode>(Op);
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
SDValue Value = ST->getValue();
MachineMemOperand *MMO = ST->getMemOperand();
Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()),
Value);
return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);
}
// FIXME: Remove this once the ANDI glue bug is fixed:
SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::i1 &&
"Custom lowering only for i1 results");
SDLoc DL(Op);
return DAG.getNode(PPCISD::ANDI_rec_1_GT_BIT, DL, MVT::i1, Op.getOperand(0));
}
SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
SelectionDAG &DAG) const {
// Implements a vector truncate that fits in a vector register as a shuffle.
// We want to legalize vector truncates down to where the source fits in
// a vector register (and target is therefore smaller than vector register
// size). At that point legalization will try to custom lower the sub-legal
// result and get here - where we can contain the truncate as a single target
// operation.
// For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
// <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
//
// We will implement it for big-endian ordering as this (where u denotes
// undefined):
// < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
// < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
//
// The same operation in little-endian ordering will be:
// <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
// <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
assert(Op.getValueType().isVector() && "Vector type expected.");
SDLoc DL(Op);
SDValue N1 = Op.getOperand(0);
unsigned SrcSize = N1.getValueType().getSizeInBits();
assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector");
SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
EVT TrgVT = Op.getValueType();
unsigned TrgNumElts = TrgVT.getVectorNumElements();
EVT EltVT = TrgVT.getVectorElementType();
unsigned WideNumElts = 128 / EltVT.getSizeInBits();
EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
// First list the elements we want to keep.
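// For the <2 x i16> -> <2 x i8> example above, SizeMult == 32/16 == 2, so
// the mask keeps bytes {0, 2} on little-endian and {1, 3} on big-endian.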
unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
SmallVector<int, 16> ShuffV;
if (Subtarget.isLittleEndian())
for (unsigned i = 0; i < TrgNumElts; ++i)
ShuffV.push_back(i * SizeMult);
else
for (unsigned i = 1; i <= TrgNumElts; ++i)
ShuffV.push_back(i * SizeMult - 1);
// Populate the remaining elements with undefs.
for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
ShuffV.push_back(WideNumElts + 1); // Any index into the undef operand works.
SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc);
return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV);
}
/// LowerSELECT_CC - Lower floating-point select_cc's into the fsel instruction
/// when possible.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// Not FP? Not a fsel.
if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
!Op.getOperand(2).getValueType().isFloatingPoint())
return Op;
bool HasNoInfs = DAG.getTarget().Options.NoInfsFPMath;
bool HasNoNaNs = DAG.getTarget().Options.NoNaNsFPMath;
// We might be able to do better than this under some circumstances, but in
// general, fsel-based lowering of select is a finite-math-only optimization.
// For more information, see section F.3 of the 2.06 ISA specification.
// With ISA 3.0, we have xsmaxcdp/xsmincdp which are OK to emit even in the
// presence of infinities.
if (!Subtarget.hasP9Vector() && (!HasNoInfs || !HasNoNaNs))
return Op;
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
EVT ResVT = Op.getValueType();
EVT CmpVT = Op.getOperand(0).getValueType();
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
SDLoc dl(Op);
if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
switch (CC) {
default:
// Not a min/max but with finite math, we may still be able to use fsel.
if (HasNoInfs && HasNoNaNs)
break;
return Op;
case ISD::SETOGT:
case ISD::SETGT:
return DAG.getNode(PPCISD::XSMAXCDP, dl, Op.getValueType(), LHS, RHS);
case ISD::SETOLT:
case ISD::SETLT:
return DAG.getNode(PPCISD::XSMINCDP, dl, Op.getValueType(), LHS, RHS);
}
}
// TODO: Propagate flags from the select rather than global settings.
SDNodeFlags Flags;
Flags.setNoInfs(true);
Flags.setNoNaNs(true);
// If the RHS of the comparison is a 0.0, we don't need to do the
// subtraction at all.
SDValue Sel1;
if (isFloatingPointZero(RHS))
switch (CC) {
default: break; // SETUO etc aren't handled by fsel.
case ISD::SETNE:
std::swap(TV, FV);
LLVM_FALLTHROUGH;
case ISD::SETEQ:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
case ISD::SETULT:
case ISD::SETLT:
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
LLVM_FALLTHROUGH;
case ISD::SETOGE:
case ISD::SETGE:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
case ISD::SETUGT:
case ISD::SETGT:
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
LLVM_FALLTHROUGH;
case ISD::SETOLE:
case ISD::SETLE:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
}
SDValue Cmp;
switch (CC) {
default: break; // SETUO etc aren't handled by fsel.
case ISD::SETNE:
std::swap(TV, FV);
LLVM_FALLTHROUGH;
case ISD::SETEQ:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
case ISD::SETULT:
case ISD::SETLT:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
case ISD::SETOGE:
case ISD::SETGE:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
case ISD::SETUGT:
case ISD::SETGT:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
case ISD::SETOLE:
case ISD::SETLE:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
}
return Op;
}
void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
SelectionDAG &DAG,
const SDLoc &dl) const {
assert(Op.getOperand(0).getValueType().isFloatingPoint());
SDValue Src = Op.getOperand(0);
if (Src.getValueType() == MVT::f32)
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
SDValue Tmp;
switch (Op.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
case MVT::i32:
Tmp = DAG.getNode(
Op.getOpcode() == ISD::FP_TO_SINT
? PPCISD::FCTIWZ
: (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
dl, MVT::f64, Src);
break;
case MVT::i64:
assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
"i64 FP_TO_UINT is supported only with FPCVT");
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
PPCISD::FCTIDUZ,
dl, MVT::f64, Src);
break;
}
// Convert the FP value to an int value through memory.
bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
(Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Emit a store to the stack slot.
SDValue Chain;
unsigned Alignment = DAG.getEVTAlignment(Tmp.getValueType());
if (i32Stack) {
MachineFunction &MF = DAG.getMachineFunction();
Alignment = 4;
MachineMemOperand *MMO =
MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment);
SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
} else
Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI, Alignment);
// Result is a load from the stack slot. If loading 4 bytes, make sure to
// add in a bias on big endian.
if (Op.getValueType() == MVT::i32 && !i32Stack) {
FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
DAG.getConstant(4, dl, FIPtr.getValueType()));
MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4);
}
RLI.Chain = Chain;
RLI.Ptr = FIPtr;
RLI.MPI = MPI;
RLI.Alignment = Alignment;
}
/// Custom lowers floating point to integer conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
SelectionDAG &DAG,
const SDLoc &dl) const {
assert(Op.getOperand(0).getValueType().isFloatingPoint());
SDValue Src = Op.getOperand(0);
if (Src.getValueType() == MVT::f32)
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
SDValue Tmp;
switch (Op.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
case MVT::i32:
Tmp = DAG.getNode(
Op.getOpcode() == ISD::FP_TO_SINT
? PPCISD::FCTIWZ
: (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
dl, MVT::f64, Src);
Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp);
break;
case MVT::i64:
assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
"i64 FP_TO_UINT is supported only with FPCVT");
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
PPCISD::FCTIDUZ,
dl, MVT::f64, Src);
Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp);
break;
}
return Tmp;
}
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
// FP to INT conversions are legal for f128.
if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128))
return Op;
// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
// PPC (the libcall is not available).
if (Op.getOperand(0).getValueType() == MVT::ppcf128) {
if (Op.getValueType() == MVT::i32) {
if (Op.getOpcode() == ISD::FP_TO_SINT) {
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
MVT::f64, Op.getOperand(0),
DAG.getIntPtrConstant(0, dl));
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
MVT::f64, Op.getOperand(0),
DAG.getIntPtrConstant(1, dl));
// Add the two halves of the long double in round-to-zero mode.
SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
// Now use a smaller FP_TO_SINT.
return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
}
if (Op.getOpcode() == ISD::FP_TO_UINT) {
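// 0x41e0000000000000 is 2^31 as an IEEE754 double (exponent field 0x41e ==
// 1054, 1054 - 1023 == 31, zero mantissa); the second ppc_fp128 word is 0.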
const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128);
// X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
// FIXME: generated code sucks.
// TODO: Are there fast-math-flags to propagate to this FSUB?
SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128,
Op.getOperand(0), Tmp);
True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True);
True = DAG.getNode(ISD::ADD, dl, MVT::i32, True,
DAG.getConstant(0x80000000, dl, MVT::i32));
SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
Op.getOperand(0));
return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False,
ISD::SETGE);
}
}
return SDValue();
}
if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
return LowerFP_TO_INTDirectMove(Op, DAG, dl);
ReuseLoadInfo RLI;
LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI,
RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
}
// We're trying to insert a regular store, S, and then a load, L. If the
// incoming value, O, is a load, we might just be able to have our load use the
// address used by O. However, we don't know if anything else will store to
// that address before we can load from it. To prevent this situation, we need
// to insert our load, L, into the chain as a peer of O. To do this, we give L
// the same chain operand as O, we create a token factor from the chain results
// of O and L, and we replace all uses of O's chain result with that token
// factor (see spliceIntoChain below for this last part).
bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
ReuseLoadInfo &RLI,
SelectionDAG &DAG,
ISD::LoadExtType ET) const {
SDLoc dl(Op);
bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&
(Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);
if (ET == ISD::NON_EXTLOAD &&
(ValidFPToUint || Op.getOpcode() == ISD::FP_TO_SINT) &&
isOperationLegalOrCustom(Op.getOpcode(),
Op.getOperand(0).getValueType())) {
LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
return true;
}
LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
LD->isNonTemporal())
return false;
if (LD->getMemoryVT() != MemVT)
return false;
RLI.Ptr = LD->getBasePtr();
if (LD->isIndexed() && !LD->getOffset().isUndef()) {
assert(LD->getAddressingMode() == ISD::PRE_INC &&
"Non-pre-inc AM on PPC?");
RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
LD->getOffset());
}
RLI.Chain = LD->getChain();
RLI.MPI = LD->getPointerInfo();
RLI.IsDereferenceable = LD->isDereferenceable();
RLI.IsInvariant = LD->isInvariant();
RLI.Alignment = LD->getAlignment();
RLI.AAInfo = LD->getAAInfo();
RLI.Ranges = LD->getRanges();
RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
return true;
}
// Given the head of the old chain, ResChain, insert a token factor containing
// it and NewResChain, and make users of ResChain now be users of that token
// factor.
// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.
void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
SDValue NewResChain,
SelectionDAG &DAG) const {
if (!ResChain)
return;
SDLoc dl(NewResChain);
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
NewResChain, DAG.getUNDEF(MVT::Other));
assert(TF.getNode() != NewResChain.getNode() &&
"A new TF really is required here");
DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
}
/// Analyze the profitability of a direct move: prefer a float load over an
/// int load plus a direct move when there is no integer use of the int load.
bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
SDNode *Origin = Op.getOperand(0).getNode();
if (Origin->getOpcode() != ISD::LOAD)
return true;
// If there is no LXSIBZX/LXSIHZX (e.g. on Power8), prefer a direct move
// if the memory size is 1 or 2 bytes.
MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2)
return true;
for (SDNode::use_iterator UI = Origin->use_begin(),
UE = Origin->use_end();
UI != UE; ++UI) {
// Only look at the users of the loaded value.
if (UI.getUse().get().getResNo() != 0)
continue;
if (UI->getOpcode() != ISD::SINT_TO_FP &&
UI->getOpcode() != ISD::UINT_TO_FP)
return true;
}
return false;
}
/// Custom lowers integer to floating point conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
SelectionDAG &DAG,
const SDLoc &dl) const {
assert((Op.getValueType() == MVT::f32 ||
Op.getValueType() == MVT::f64) &&
"Invalid floating point type as target of conversion");
assert(Subtarget.hasFPCVT() &&
"Int to FP conversions with direct moves require FPCVT");
SDValue FP;
SDValue Src = Op.getOperand(0);
bool SinglePrec = Op.getValueType() == MVT::f32;
bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
bool Signed = Op.getOpcode() == ISD::SINT_TO_FP;
unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) :
(SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU);
if (WordInt) {
FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ,
dl, MVT::f64, Src);
FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
}
else {
FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src);
FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
}
return FP;
}
static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
EVT VecVT = Vec.getValueType();
assert(VecVT.isVector() && "Expected a vector type.");
assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");
EVT EltVT = VecVT.getVectorElementType();
unsigned WideNumElts = 128 / EltVT.getSizeInBits();
EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();
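// For example, widening v2i16: WideNumElts == 128/16 == 8, so NumConcat == 4
// (the source vector followed by three undef vectors).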
SmallVector<SDValue, 16> Ops(NumConcat);
Ops[0] = Vec;
SDValue UndefVec = DAG.getUNDEF(VecVT);
for (unsigned i = 1; i < NumConcat; ++i)
Ops[i] = UndefVec;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops);
}
SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
unsigned Opc = Op.getOpcode();
assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) &&
"Unexpected conversion type");
assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&
"Supports conversions to v2f64/v4f32 only.");
bool SignedConv = Opc == ISD::SINT_TO_FP;
bool FourEltRes = Op.getValueType() == MVT::v4f32;
SDValue Wide = widenVec(DAG, Op.getOperand(0), dl);
EVT WideVT = Wide.getValueType();
unsigned WideNumElts = WideVT.getVectorNumElements();
MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;
SmallVector<int, 16> ShuffV;
for (unsigned i = 0; i < WideNumElts; ++i)
ShuffV.push_back(i + WideNumElts);
int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2;
int SaveElts = FourEltRes ? 4 : 2;
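// For example, for v4i8 -> v4f32: Wide is v16i8, Stride == 4, SaveElts == 4,
// so on little-endian the loop below rewrites ShuffV[0,4,8,12] to 0..3. Each
// 4-byte lane then holds one source byte, with the remaining bytes taken from
// ShuffleSrc2 (zeros for unsigned, undef for signed).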
if (Subtarget.isLittleEndian())
for (int i = 0; i < SaveElts; i++)
ShuffV[i * Stride] = i;
else
for (int i = 1; i <= SaveElts; i++)
ShuffV[i * Stride - 1] = i - 1;
SDValue ShuffleSrc2 =
SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT);
SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);
unsigned ExtendOp =
SignedConv ? (unsigned)PPCISD::SExtVElems : (unsigned)ISD::BITCAST;
SDValue Extend;
if (!Subtarget.hasP9Altivec() && SignedConv) {
Arrange = DAG.getBitcast(IntermediateVT, Arrange);
Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange,
DAG.getValueType(Op.getOperand(0).getValueType()));
} else
Extend = DAG.getNode(ExtendOp, dl, IntermediateVT, Arrange);
return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
}
SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT InVT = Op.getOperand(0).getValueType();
EVT OutVT = Op.getValueType();
if (OutVT.isVector() && OutVT.isFloatingPoint() &&
isOperationCustom(Op.getOpcode(), InVT))
return LowerINT_TO_FPVector(Op, DAG, dl);
// Conversions to f128 are legal.
if (EnableQuadPrecision && (Op.getValueType() == MVT::f128))
return Op;
if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
return SDValue();
SDValue Value = Op.getOperand(0);
// The values are now known to be -1 (false) or 1 (true). To convert this
// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
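// Check: V == -1 gives 0.5*(-1) + 0.5 == 0.0; V == 1 gives 0.5*1 + 0.5 == 1.0.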
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
if (Op.getValueType() != MVT::v4f64)
Value = DAG.getNode(ISD::FP_ROUND, dl,
Op.getValueType(), Value,
DAG.getIntPtrConstant(1, dl));
return Value;
}
// Don't handle ppc_fp128 here; let it be lowered to a libcall.
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
if (Op.getOperand(0).getValueType() == MVT::i1)
return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
DAG.getConstantFP(1.0, dl, Op.getValueType()),
DAG.getConstantFP(0.0, dl, Op.getValueType()));
// If we have direct moves, we can do all the conversion and skip the
// store/load; however, without FPCVT we can't do most conversions.
if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
Subtarget.isPPC64() && Subtarget.hasFPCVT())
return LowerINT_TO_FPDirectMove(Op, DAG, dl);
assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
"UINT_TO_FP is supported only with FPCVT");
// If we have FCFIDS, then use it when converting to single-precision.
// Otherwise, convert to double-precision and then round.
unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
: PPCISD::FCFIDS)
: (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
: PPCISD::FCFID);
MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? MVT::f32
: MVT::f64;
if (Op.getOperand(0).getValueType() == MVT::i64) {
SDValue SINT = Op.getOperand(0);
// When converting to single-precision, we actually need to convert
// to double-precision first and then round to single-precision.
// To avoid double-rounding effects during that operation, we have
// to prepare the input operand. Bits that might be truncated when
// converting to double-precision are replaced by a bit that won't
// be lost at this stage, but is below the single-precision rounding
// position.
//
// However, if -enable-unsafe-fp-math is in effect, accept double
// rounding to avoid the extra overhead.
if (Op.getValueType() == MVT::f32 &&
!Subtarget.hasFPCVT() &&
!DAG.getTarget().Options.UnsafeFPMath) {
// Twiddle input to make sure the low 11 bits are zero. (If this
// is the case, we are guaranteed the value will fit into the 53 bit
// mantissa of an IEEE double-precision value without rounding.)
// If any of those low 11 bits were not zero originally, make sure
// bit 12 (value 2048) is set instead, so that the final rounding
// to single-precision gets the correct result.
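// For example, if the low 11 bits are 0x001, then 0x001 + 0x7ff == 0x800,
// so the 2048 bit is set after the OR and survives the final mask. If they
// are all zero, 0x000 + 0x7ff == 0x7ff is cleared again by the final AND,
// leaving the value unchanged.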
SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
SINT, DAG.getConstant(2047, dl, MVT::i64));
Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
Round, DAG.getConstant(2047, dl, MVT::i64));
Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
Round = DAG.getNode(ISD::AND, dl, MVT::i64,
Round, DAG.getConstant(-2048, dl, MVT::i64));
// However, we cannot use that value unconditionally: if the magnitude
// of the input value is small, the bit-twiddling we did above might
// end up visibly changing the output. Fortunately, in that case, we
// don't need to twiddle bits since the original input will convert
// exactly to double-precision floating-point already. Therefore,
// construct a conditional to use the original value if the top 11
// bits are all sign-bit copies, and use the rounded value computed
// above otherwise.
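// After the arithmetic shift right by 53, a value whose top 11 bits are all
// sign-bit copies becomes 0 or -1; adding 1 yields 0 or 1, so the unsigned
// greater-than-1 test below is false exactly for those safe values.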
SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
SINT, DAG.getConstant(53, dl, MVT::i32));
Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
Cond, DAG.getConstant(1, dl, MVT::i64));
Cond = DAG.getSetCC(dl, MVT::i32,
Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);
SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
}
ReuseLoadInfo RLI;
SDValue Bits;
MachineFunction &MF = DAG.getMachineFunction();
if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI,
RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
} else if (Subtarget.hasLFIWAX() &&
canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
} else if (Subtarget.hasFPCVT() &&
canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
} else if (((Subtarget.hasLFIWAX() &&
SINT.getOpcode() == ISD::SIGN_EXTEND) ||
(Subtarget.hasFPCVT() &&
SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
SINT.getOperand(0).getValueType() == MVT::i32) {
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int FrameIdx = MFI.CreateStackObject(4, 4, false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Store =
DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), FrameIdx));
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
RLI.Ptr = FIdx;
RLI.Chain = Store;
RLI.MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
RLI.Alignment = 4;
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
PPCISD::LFIWZX : PPCISD::LFIWAX,
dl, DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
} else
Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
FP = DAG.getNode(ISD::FP_ROUND, dl,
MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
return FP;
}
assert(Op.getOperand(0).getValueType() == MVT::i32 &&
"Unhandled INT_TO_FP type in custom expander!");
// Since we only generate this in 64-bit mode, we can take advantage of
// 64-bit registers. In particular, sign extend the input value into the
// 64-bit register with extsw, store the WHOLE 64-bit value into the stack,
// then lfd it and fcfid it.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT PtrVT = getPointerTy(MF.getDataLayout());
SDValue Ld;
if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
ReuseLoadInfo RLI;
bool ReusingLoad;
if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
DAG))) {
int FrameIdx = MFI.CreateStackObject(4, 4, false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Store =
DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), FrameIdx));
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
RLI.Ptr = FIdx;
RLI.Chain = Store;
RLI.MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
RLI.Alignment = 4;
}
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
PPCISD::LFIWZX : PPCISD::LFIWAX,
dl, DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
if (ReusingLoad)
spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
} else {
assert(Subtarget.isPPC64() &&
"i32->FP without LFIWAX supported only on PPC64");
int FrameIdx = MFI.CreateStackObject(8, 8, false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
Op.getOperand(0));
// STD the extended value into the stack slot.
SDValue Store = DAG.getStore(
DAG.getEntryNode(), dl, Ext64, FIdx,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
// Load the value as a double.
Ld = DAG.getLoad(
MVT::f64, dl, Store, FIdx,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
}
// FCFID it and return it.
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
DAG.getIntPtrConstant(0, dl));
return FP;
}
SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
/*
The rounding mode is in bits 30:31 of FPSCR, and has the following
settings:
00 Round to nearest
01 Round to 0
10 Round to +inf
11 Round to -inf
FLT_ROUNDS, on the other hand, expects the following:
-1 Undefined
0 Round to 0
1 Round to nearest
2 Round to +inf
3 Round to -inf
To perform the conversion, we do:
((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
*/
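/*
 Working the formula through each FPSCR setting confirms the mapping:
   FPSCR 00: (0 ^ ((~0 & 3) >> 1)) = 0 ^ 1 = 1  (round to nearest)
   FPSCR 01: (1 ^ ((~1 & 3) >> 1)) = 1 ^ 1 = 0  (round to 0)
   FPSCR 10: (2 ^ ((~2 & 3) >> 1)) = 2 ^ 0 = 2  (round to +inf)
   FPSCR 11: (3 ^ ((~3 & 3) >> 1)) = 3 ^ 0 = 3  (round to -inf)
*/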
MachineFunction &MF = DAG.getMachineFunction();
EVT VT = Op.getValueType();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Save FP Control Word to register
EVT NodeTys[] = {
MVT::f64, // return register
MVT::Glue // unused in this context
};
SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);
// Save FP register to stack slot
int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot,
MachinePointerInfo());
// Load FP Control Word from low 32 bits of stack slot.
SDValue Four = DAG.getConstant(4, dl, PtrVT);
SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo());
// Transform as necessary
SDValue CWD1 =
DAG.getNode(ISD::AND, dl, MVT::i32,
CWD, DAG.getConstant(3, dl, MVT::i32));
SDValue CWD2 =
DAG.getNode(ISD::SRL, dl, MVT::i32,
DAG.getNode(ISD::AND, dl, MVT::i32,
DAG.getNode(ISD::XOR, dl, MVT::i32,
CWD, DAG.getConstant(3, dl, MVT::i32)),
DAG.getConstant(3, dl, MVT::i32)),
DAG.getConstant(1, dl, MVT::i32));
SDValue RetVal =
DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
return DAG.getNode((VT.getSizeInBits() < 16 ?
ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
}
SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
SDLoc dl(Op);
assert(Op.getNumOperands() == 3 &&
VT == Op.getOperand(1).getValueType() &&
"Unexpected SHL!");
// Expand into a bunch of logical ops. Note that these ops
// depend on the PPC behavior for oversized shift amounts.
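// Illustrative example, assuming the PPC convention that shift amounts in
// [BitWidth, 2*BitWidth) produce zero: with BitWidth == 32 and Amt == 40,
//   Hi << Amt         -> 0        (oversized shift)
//   Lo >> (32 - Amt)  -> 0        (32 - 40 wraps to an oversized amount)
//   Lo << (Amt - 32)  -> Lo << 8  (supplies the high word)
// so OutHi == Lo << 8 and OutLo == Lo << Amt == 0, as required.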
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
EVT AmtVT = Amt.getValueType();
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
DAG.getConstant(BitWidth, dl, AmtVT), Amt);
SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3);
SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
DAG.getConstant(-BitWidth, dl, AmtVT));
SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
SDValue OutOps[] = { OutLo, OutHi };
return DAG.getMergeValues(OutOps, dl);
}
SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
unsigned BitWidth = VT.getSizeInBits();
assert(Op.getNumOperands() == 3 &&
VT == Op.getOperand(1).getValueType() &&
"Unexpected SRL!");
// Expand into a bunch of logical ops. Note that these ops
// depend on the PPC behavior for oversized shift amounts.
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
EVT AmtVT = Amt.getValueType();
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
DAG.getConstant(BitWidth, dl, AmtVT), Amt);
SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
DAG.getConstant(-BitWidth, dl, AmtVT));
SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
SDValue OutOps[] = { OutLo, OutHi };
return DAG.getMergeValues(OutOps, dl);
}
SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
assert(Op.getNumOperands() == 3 &&
VT == Op.getOperand(1).getValueType() &&
"Unexpected SRA!");
// Expand into a bunch of logical ops, followed by a select_cc.
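// Unlike the SRL_PARTS expansion, the low result cannot be formed from ORs
// alone: when Amt > BitWidth the low word must be Hi arithmetically shifted
// by (Amt - BitWidth), so the select_cc on Tmp5 (== Amt - BitWidth) below
// picks the OR-combined value when Amt <= BitWidth and the SRA result
// otherwise.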
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
EVT AmtVT = Amt.getValueType();
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
DAG.getConstant(BitWidth, dl, AmtVT), Amt);
SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
DAG.getConstant(-BitWidth, dl, AmtVT));
SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT),
Tmp4, Tmp6, ISD::SETLE);
SDValue OutOps[] = { OutLo, OutHi };
return DAG.getMergeValues(OutOps, dl);
}
//===----------------------------------------------------------------------===//
// Vector related lowering.
//
/// BuildSplatI - Build a canonical splati of Val with an element size of
/// SplatSize. Cast the result to VT.
static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
SelectionDAG &DAG, const SDLoc &dl) {
static const MVT VTys[] = { // canonical VT to use for each size.
MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
};
EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
// Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
if (Val == -1)
SplatSize = 1;
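// For example, a request for a v4i32 splat of -1 is canonicalized to a
// v16i8 splat of -1 (ultimately a single vspltisb -1) and then bitcast
// back to v4i32.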
EVT CanonicalVT = VTys[SplatSize-1];
// Build a canonical splat for this value.
return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));
}
/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
const SDLoc &dl, EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = Op.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, dl, MVT::i32), Op);
}
/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
SelectionDAG &DAG, const SDLoc &dl,
EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = LHS.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, dl, MVT::i32), LHS, RHS);
}
/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = Op0.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2);
}
/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
/// amount. The result has the specified value type.
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
SelectionDAG &DAG, const SDLoc &dl) {
// Force LHS/RHS to be the right type.
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
int Ops[16];
for (unsigned i = 0; i != 16; ++i)
Ops[i] = i + Amt;
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
return DAG.getNode(ISD::BITCAST, dl, VT, T);
}
/// Do we have an efficient pattern in a .td file for this node?
///
/// \param V - pointer to the BuildVectorSDNode being matched
/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
///
/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
/// the opposite is true (expansion is beneficial) are:
/// - The node builds a vector out of integers that are not 32 or 64-bits
/// - The node builds a vector out of constants
/// - The node is a "load-and-splat"
/// In all other cases, we will choose to keep the BUILD_VECTOR.
static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
bool HasDirectMove,
bool HasP8Vector) {
EVT VecVT = V->getValueType(0);
bool RightType = VecVT == MVT::v2f64 ||
(HasP8Vector && VecVT == MVT::v4f32) ||
(HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
if (!RightType)
return false;
bool IsSplat = true;
bool IsLoad = false;
SDValue Op0 = V->getOperand(0);
// This function is called in a block that confirms the node is not a constant
// splat. So a constant BUILD_VECTOR here means the vector is built out of
// different constants.
if (V->isConstant())
return false;
for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
if (V->getOperand(i).isUndef())
return false;
// We want to expand nodes that represent load-and-splat even if the
// loaded value is a floating point truncation or conversion to int.
if (V->getOperand(i).getOpcode() == ISD::LOAD ||
(V->getOperand(i).getOpcode() == ISD::FP_ROUND &&
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
(V->getOperand(i).getOpcode() == ISD::FP_TO_SINT &&
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
(V->getOperand(i).getOpcode() == ISD::FP_TO_UINT &&
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD))
IsLoad = true;
// If the operands are different or the input is not a load and has more
// uses than just this BV node, then it isn't a splat.
if (V->getOperand(i) != Op0 ||
(!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode())))
IsSplat = false;
}
return !(IsSplat && IsLoad);
}
// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Op0 = Op->getOperand(0);
if (!EnableQuadPrecision ||
(Op.getValueType() != MVT::f128 ) ||
(Op0.getOpcode() != ISD::BUILD_PAIR) ||
(Op0.getOperand(0).getValueType() != MVT::i64) ||
(Op0.getOperand(1).getValueType() != MVT::i64))
return SDValue();
return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0),
Op0.getOperand(1));
}
static const SDValue *getNormalLoadInput(const SDValue &Op) {
const SDValue *InputLoad = &Op;
if (InputLoad->getOpcode() == ISD::BITCAST)
InputLoad = &InputLoad->getOperand(0);
if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR)
InputLoad = &InputLoad->getOperand(0);
if (InputLoad->getOpcode() != ISD::LOAD)
return nullptr;
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
}
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
// this case more efficiently than a constant pool load, lower it to the
// sequence of ops that should be used.
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
// We first build an i32 vector, load it into a QPX register,
// then convert it to a floating-point vector and compare it
// to a zero vector to get the boolean result.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, 16, false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
assert(BVN->getNumOperands() == 4 &&
"BUILD_VECTOR for v4i1 does not have 4 operands");
bool IsConst = true;
for (unsigned i = 0; i < 4; ++i) {
if (BVN->getOperand(i).isUndef()) continue;
if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
IsConst = false;
break;
}
}
if (IsConst) {
Constant *One =
ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
Constant *NegOne =
ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);
Constant *CV[4];
for (unsigned i = 0; i < 4; ++i) {
if (BVN->getOperand(i).isUndef())
CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
else if (isNullConstant(BVN->getOperand(i)))
CV[i] = NegOne;
else
CV[i] = One;
}
Constant *CP = ConstantVector::get(CV);
SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()),
16 /* alignment */);
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
return DAG.getMemIntrinsicNode(
PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i < 4; ++i) {
if (BVN->getOperand(i).isUndef()) continue;
unsigned Offset = 4*i;
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
if (StoreSize > 4) {
Stores.push_back(
DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
PtrInfo.getWithOffset(Offset), MVT::i32));
} else {
SDValue StoreValue = BVN->getOperand(i);
if (StoreSize < 4)
StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);
Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
PtrInfo.getWithOffset(Offset)));
}
}
SDValue StoreChain;
if (!Stores.empty())
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
else
StoreChain = DAG.getEntryNode();
// Now load from v4i32 into the QPX register; this will extend it to
// v4i64 but not yet convert it to a floating point. Nevertheless, this
// is typed as v4f64 because the QPX register integer states are not
// explicitly represented.
SDValue Ops[] = {StoreChain,
DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
FIdx};
SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});
SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
LoadedVect);
SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);
return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
}
// All other QPX vectors are handled by generic code.
if (Subtarget.hasQPX())
return SDValue();
// Check if this is a splat of a constant value.
APInt APSplatBits, APSplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
SplatBitSize > 32) {
const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
// Handle load-and-splat patterns as we have instructions that will do this
// in one go.
if (InputLoad && DAG.isSplatValue(Op, true)) {
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
// We have handling for 4 and 8 byte elements.
unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
// Checking for a single use of this load, we have to check for vector
// width (128 bits) / ElementSize uses (since each operand of the
// BUILD_VECTOR is a separate use of the value).
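// For example, a v2i64 BUILD_VECTOR splatting a 64-bit loaded value uses
// that value twice (128 / 64), and a v4i32 one uses it four times.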
if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) &&
((Subtarget.hasVSX() && ElementSize == 64) ||
(Subtarget.hasP9Vector() && ElementSize == 32))) {
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr(), // Ptr
DAG.getValueType(Op.getValueType()) // VT
};
return
DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl,
DAG.getVTList(Op.getValueType(), MVT::Other),
Ops, LD->getMemoryVT(), LD->getMemOperand());
}
}
// BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
// lowered to VSX instructions under certain conditions.
// Without VSX, there is no pattern more efficient than expanding the node.
if (Subtarget.hasVSX() &&
haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
Subtarget.hasP8Vector()))
return Op;
return SDValue();
}
unsigned SplatBits = APSplatBits.getZExtValue();
unsigned SplatUndef = APSplatUndef.getZExtValue();
unsigned SplatSize = SplatBitSize / 8;
// First, handle single instruction cases.
// All zeros?
if (SplatBits == 0) {
// Canonicalize all zero vectors to be v4i32.
if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
}
return Op;
}
// We have XXSPLTIB for constant splats one byte wide
// FIXME: SplatBits is an unsigned int being cast to an int while passing it
// as an argument to BuildSplatI. Given SplatSize == 1 it is okay here.
if (Subtarget.hasP9Vector() && SplatSize == 1)
return BuildSplatI(SplatBits, SplatSize, Op.getValueType(), DAG, dl);
// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
(32-SplatBitSize));
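// For example, a 16-bit splat of 0xFFFE gives
//   int32_t(0xFFFE << 16) >> 16 == 0xFFFFFFFE == -2,
// which is within [-16,15] and therefore a single vsplti.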
if (SextVal >= -16 && SextVal <= 15)
return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
// Two instruction sequences.
// If this value is in the range [-32,30] and is even, use:
// VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
// If this value is in the range [17,31] and is odd, use:
// VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
// If this value is in the range [-31,-17] and is odd, use:
// VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
// Note the last two are three-instruction sequences.
if (SextVal >= -32 && SextVal <= 31) {
// To avoid having these optimizations undone by constant folding,
// we convert to a pseudo that will be expanded later into one of
// the above forms.
SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
EVT VT = (SplatSize == 1 ? MVT::v16i8 :
(SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
if (VT == Op.getValueType())
return RetVal;
else
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
}
// If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is
// 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important
// for fneg/fabs.
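// The trick below relies on vslw using only the low 5 bits of each shift
// amount: shifting the all-ones vector by itself shifts each word left by
// 31, yielding 0x8000_0000, and the final xor with all-ones then gives
// 0x7FFF_FFFF.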
if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
// Make -1 and vspltisw -1:
SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
// Make the VSLW intrinsic, computing 0x8000_0000.
SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
OnesV, DAG, dl);
// xor by OnesV to invert it.
Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
// Check to see if this is a wide variety of vsplti*, binop self cases.
static const signed char SplatCsts[] = {
-1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
-8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
};
for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
// Indirect through the SplatCsts array so that we favor 'vsplti -1' for
// cases which are ambiguous (e.g. formation of 0x8000_0000).
int i = SplatCsts[idx];
// Figure out what shift amount will be used by altivec if shifted by i in
// this splat size.
unsigned TypeShiftAmt = i & (SplatBitSize-1);
// vsplti + shl self.
if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
Intrinsic::ppc_altivec_vslw
};
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
// vsplti + srl self.
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
Intrinsic::ppc_altivec_vsrw
};
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
// vsplti + sra self.
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
Intrinsic::ppc_altivec_vsraw
};
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
// vsplti + rol self.
if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
Intrinsic::ppc_altivec_vrlw
};
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
// t = vsplti c, result = vsldoi t, t, 1
if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 2
if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 3
if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
}
return SDValue();
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
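// Each 13-bit ID encodes four source elements in base 9, where digits 0-7
// select an input element and 8 means undef; e.g. <0,1,2,3> is encoded as
// ((0*9+1)*9+2)*9+3, matching the OP_COPY checks below.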
enum {
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
OP_VMRGHW,
OP_VMRGLW,
OP_VSPLTISW0,
OP_VSPLTISW1,
OP_VSPLTISW2,
OP_VSPLTISW3,
OP_VSLDOI4,
OP_VSLDOI8,
OP_VSLDOI12
};
if (OpNum == OP_COPY) {
if (LHSID == (1*9+2)*9+3) return LHS;
assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
return RHS;
}
SDValue OpLHS, OpRHS;
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
int ShufIdxs[16];
switch (OpNum) {
default: llvm_unreachable("Unknown i32 permute!");
case OP_VMRGHW:
ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3;
ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7;
ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
break;
case OP_VMRGLW:
ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
break;
case OP_VSPLTISW0:
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+0;
break;
case OP_VSPLTISW1:
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+4;
break;
case OP_VSPLTISW2:
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+8;
break;
case OP_VSPLTISW3:
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+12;
break;
case OP_VSLDOI4:
return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
case OP_VSLDOI8:
return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
case OP_VSLDOI12:
return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
}
EVT VT = OpLHS.getValueType();
OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
return DAG.getNode(ISD::BITCAST, dl, VT, T);
}
/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
/// SDValue.
SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
SelectionDAG &DAG) const {
const unsigned BytesInVector = 16;
bool IsLE = Subtarget.isLittleEndian();
SDLoc dl(N);
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
unsigned ShiftElts = 0, InsertAtByte = 0;
bool Swap = false;
// Shifts required to get the byte we want at element 7.
unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
0, 15, 14, 13, 12, 11, 10, 9};
unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
1, 2, 3, 4, 5, 6, 7, 8};
ArrayRef<int> Mask = N->getMask();
int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
// For each mask element, find out if we're just inserting something
// from V2 into V1 or vice versa.
// Possible permutations inserting an element from V2 into V1:
// X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// ...
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
// Inserting from V1 into V2 will be similar, except mask range will be
// [16,31].
bool FoundCandidate = false;
// If both vector operands for the shuffle are the same vector, the mask
// will contain only elements from the first one and the second one will be
// undef.
unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
// Go through the mask of bytes to find an element that's being moved
// from one vector to the other.
for (unsigned i = 0; i < BytesInVector; ++i) {
unsigned CurrentElement = Mask[i];
// If the 2nd operand is undefined, we should only look for the source
// element (7 on big-endian, 8 on little-endian) in the Mask.
if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
continue;
bool OtherElementsInOrder = true;
// Examine the other elements in the Mask to see if they're in original
// order.
for (unsigned j = 0; j < BytesInVector; ++j) {
if (j == i)
continue;
// If CurrentElement is from V1 [0,15], then we expect the rest of the Mask
// to be from V2 [16,31] and vice versa, unless the 2nd operand is undefined,
// in which case we assume we're always picking from the 1st operand.
int MaskOffset =
(!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
if (Mask[j] != OriginalOrder[j] + MaskOffset) {
OtherElementsInOrder = false;
break;
}
}
// If other elements are in original order, we record the number of shifts
// we need to get the element we want into element 7. Also record which byte
// in the vector we should insert into.
if (OtherElementsInOrder) {
// If 2nd operand is undefined, we assume no shifts and no swapping.
if (V2.isUndef()) {
ShiftElts = 0;
Swap = false;
} else {
// Only need the last 4 bits for shifts because operands will be
// swapped if CurrentElement is >= 2^4.
ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
: BigEndianShifts[CurrentElement & 0xF];
Swap = CurrentElement < BytesInVector;
}
InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
FoundCandidate = true;
break;
}
}
if (!FoundCandidate)
return SDValue();
// Candidate found, construct the proper SDAG sequence with VINSERTB,
// optionally with VECSHL if shift is required.
if (Swap)
std::swap(V1, V2);
if (V2.isUndef())
V2 = V1;
if (ShiftElts) {
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
}
return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
}
/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
/// SDValue.
SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
SelectionDAG &DAG) const {
const unsigned NumHalfWords = 8;
const unsigned BytesInVector = NumHalfWords * 2;
// Check that the shuffle is on half-words.
if (!isNByteElemShuffleMask(N, 2, 1))
return SDValue();
bool IsLE = Subtarget.isLittleEndian();
SDLoc dl(N);
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
unsigned ShiftElts = 0, InsertAtByte = 0;
bool Swap = false;
// Shifts required to get the half-word we want at element 3.
unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};
uint32_t Mask = 0;
uint32_t OriginalOrderLow = 0x1234567;
uint32_t OriginalOrderHigh = 0x89ABCDEF;
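// These constants are the identity orders packed one nibble per half-word:
// 0x1234567 (i.e. 0x01234567) is <0,1,2,3,4,5,6,7> and 0x89ABCDEF is
// <8,9,10,11,12,13,14,15>.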
// Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a
// 32-bit space, needing only a 4-bit nibble per element.
for (unsigned i = 0; i < NumHalfWords; ++i) {
unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
}
// For each mask element, find out if we're just inserting something
// from V2 into V1 or vice versa. Possible permutations inserting an element
// from V2 into V1:
// X, 1, 2, 3, 4, 5, 6, 7
// 0, X, 2, 3, 4, 5, 6, 7
// 0, 1, X, 3, 4, 5, 6, 7
// 0, 1, 2, X, 4, 5, 6, 7
// 0, 1, 2, 3, X, 5, 6, 7
// 0, 1, 2, 3, 4, X, 6, 7
// 0, 1, 2, 3, 4, 5, X, 7
// 0, 1, 2, 3, 4, 5, 6, X
// Inserting from V1 into V2 will be similar, except mask range will be [8,15].
bool FoundCandidate = false;
// Go through the mask of half-words to find an element that's being moved
// from one vector to the other.
for (unsigned i = 0; i < NumHalfWords; ++i) {
unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
uint32_t MaskOtherElts = ~(0xF << MaskShift);
uint32_t TargetOrder = 0x0;
// If both vector operands for the shuffle are the same vector, the mask
// will contain only elements from the first one and the second one will be
// undef.
if (V2.isUndef()) {
ShiftElts = 0;
unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
TargetOrder = OriginalOrderLow;
Swap = false;
// Skip if this is not the correct element, or if the mask of the other
// elements doesn't match our expected order.
if (MaskOneElt == VINSERTHSrcElem &&
(Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
FoundCandidate = true;
break;
}
} else { // If both operands are defined.
// Target order is [8,15] if the current mask is between [0,7].
TargetOrder =
(MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
// Skip if the mask of the other elements doesn't match our expected order.
if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
// We only need the last 3 bits for the number of shifts.
ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
: BigEndianShifts[MaskOneElt & 0x7];
InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
Swap = MaskOneElt < NumHalfWords;
FoundCandidate = true;
break;
}
}
}
if (!FoundCandidate)
return SDValue();
// Candidate found, construct the proper SDAG sequence with VINSERTH,
// optionally with VECSHL if shift is required.
if (Swap)
std::swap(V1, V2);
if (V2.isUndef())
V2 = V1;
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
if (ShiftElts) {
// Double ShiftElts because we're left shifting on v16i8 type.
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
/// is a shuffle we can handle in a single instruction, return it. Otherwise,
/// return the code it can be lowered into. Worst case, it can always be
/// lowered into a vperm.
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
EVT VT = Op.getValueType();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned ShiftElts, InsertAtByte;
bool Swap = false;
// If this is a load-and-splat, we can do that with a single instruction
// in some cases. However if the load has multiple uses, we don't want to
// combine it because that will just produce multiple loads.
const SDValue *InputLoad = getNormalLoadInput(V1);
if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
(PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
InputLoad->hasOneUse()) {
bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
int SplatIdx =
PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
// For 4-byte load-and-splat, we need Power9.
if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
uint64_t Offset = 0;
if (IsFourByte)
Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
else
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
SDValue BasePtr = LD->getBasePtr();
if (Offset != 0)
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
BasePtr, DAG.getIntPtrConstant(Offset, dl));
SDValue Ops[] = {
LD->getChain(), // Chain
BasePtr, // BasePtr
DAG.getValueType(Op.getValueType()) // VT
};
SDVTList VTL =
DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);
SDValue LdSplt =
DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
Ops, LD->getMemoryVT(), LD->getMemOperand());
if (LdSplt.getValueType() != SVOp->getValueType(0))
LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
return LdSplt;
}
}
if (Subtarget.hasP9Vector() &&
PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
isLittleEndian)) {
if (Swap)
std::swap(V1, V2);
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
if (ShiftElts) {
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
if (Subtarget.hasP9Altivec()) {
SDValue NewISDNode;
if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
return NewISDNode;
if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
return NewISDNode;
}
if (Subtarget.hasVSX() &&
PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
if (Swap)
std::swap(V1, V2);
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Conv2 =
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2);
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
}
if (Subtarget.hasVSX() &&
PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
if (Swap)
std::swap(V1, V2);
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
SDValue Conv2 =
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);
SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
}
if (Subtarget.hasP9Vector()) {
if (PPC::isXXBRHShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
SDValue ReveHWord = DAG.getNode(ISD::BSWAP, dl, MVT::v8i16, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);
} else if (PPC::isXXBRWShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue ReveWord = DAG.getNode(ISD::BSWAP, dl, MVT::v4i32, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);
} else if (PPC::isXXBRDShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
SDValue ReveDWord = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);
} else if (PPC::isXXBRQShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);
SDValue ReveQWord = DAG.getNode(ISD::BSWAP, dl, MVT::v1i128, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);
}
}
if (Subtarget.hasVSX()) {
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
DAG.getConstant(SplatIdx, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
}
// Left shifts of 8 bytes are actually swaps. Convert accordingly.
if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
}
}
if (Subtarget.hasQPX()) {
if (VT.getVectorNumElements() != 4)
return SDValue();
if (V2.isUndef()) V2 = V1;
int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
if (AlignIdx != -1) {
return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2,
DAG.getConstant(AlignIdx, dl, MVT::i32));
} else if (SVOp->isSplat()) {
int SplatIdx = SVOp->getSplatIndex();
if (SplatIdx >= 4) {
std::swap(V1, V2);
SplatIdx -= 4;
}
return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
DAG.getConstant(SplatIdx, dl, MVT::i32));
}
// Lower this into a qvgpci/qvfperm pair.
// Compute the qvgpci literal
unsigned idx = 0;
for (unsigned i = 0; i < 4; ++i) {
int m = SVOp->getMaskElt(i);
unsigned mm = m >= 0 ? (unsigned) m : i;
idx |= mm << (3-i)*3;
}
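// Each selector m is encoded in 3 bits, element 0 in the highest bits; the
// identity mask <0,1,2,3>, for example, yields the literal
// (0 << 9) | (1 << 6) | (2 << 3) | 3 == 83.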
SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64,
DAG.getConstant(idx, dl, MVT::i32));
return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3);
}
// Cases that are handled by instructions that take permute immediates
// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
// selected by the instruction selector.
if (V2.isUndef()) {
if (PPC::isSplatShuffleMask(SVOp, 1) ||
PPC::isSplatShuffleMask(SVOp, 2) ||
PPC::isSplatShuffleMask(SVOp, 4) ||
PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) ||
(Subtarget.hasP8Altivec() && (
PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {
return Op;
}
}
// Altivec has a variety of "shuffle immediates" that take two vector inputs
// and produce a fixed permutation. If any of these match, do not lower to
// VPERM.
unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
(Subtarget.hasP8Altivec() && (
PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
return Op;
// Check to see if this is a shuffle of 4-byte values. If so, we can use our
// perfect shuffle table to emit an optimal matching sequence.
ArrayRef<int> PermMask = SVOp->getMask();
unsigned PFIndexes[4];
bool isFourElementShuffle = true;
for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
unsigned EltNo = 8; // Start out undef.
for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
if (PermMask[i*4+j] < 0)
continue; // Undef, ignore it.
unsigned ByteSource = PermMask[i*4+j];
if ((ByteSource & 3) != j) {
isFourElementShuffle = false;
break;
}
if (EltNo == 8) {
EltNo = ByteSource/4;
} else if (EltNo != ByteSource/4) {
isFourElementShuffle = false;
break;
}
}
PFIndexes[i] = EltNo;
}
// If this shuffle can be expressed as a shuffle of 4-byte elements, use the
// perfect shuffle vector to determine if it is cost effective to do this as
// discrete instructions, or whether we should use a vperm.
// For now, we skip this for little endian until such time as we have a
// little-endian perfect shuffle table.
if (isFourElementShuffle && !isLittleEndian) {
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex =
PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
// Determining when to avoid vperm is tricky. Many things affect the cost
// of vperm, particularly how many times the perm mask needs to be computed.
// For example, if the perm mask can be hoisted out of a loop or is already
// used (perhaps because there are multiple permutes with the same shuffle
// mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of
// the loop requires an extra register.
//
// As a compromise, we only emit discrete instructions if the shuffle can be
// generated in 3 or fewer operations. When we have loop information
// available, if this block is within a loop, we should avoid using vperm
// for 3-operation perms and use a constant pool load instead.
if (Cost < 3)
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
}
// Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
// vector that will get spilled to the constant pool.
if (V2.isUndef()) V2 = V1;
// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
// that it is in input element units, not in bytes. Convert now.
// For little endian, the order of the input vectors is reversed, and
// the permutation mask is complemented with respect to 31. This is
// necessary to produce proper semantics with the big-endian-biased vperm
// instruction.
EVT EltVT = V1.getValueType().getVectorElementType();
unsigned BytesPerElement = EltVT.getSizeInBits()/8;
SmallVector<SDValue, 16> ResultMask;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
for (unsigned j = 0; j != BytesPerElement; ++j)
if (isLittleEndian)
ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j),
dl, MVT::i32));
else
ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl,
MVT::i32));
}
SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
if (isLittleEndian)
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
V2, V1, VPermMask);
else
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
V1, V2, VPermMask);
}
/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
/// vector comparison. If it is, return true and fill in CompareOpc/isDot with
/// information about the intrinsic.
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
bool &isDot, const PPCSubtarget &Subtarget) {
unsigned IntrinsicID =
cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
CompareOpc = -1;
isDot = false;
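// The CompareOpc values below appear to be the VC-form extended-opcode
// fields of the matching vcmp* instructions (e.g. 6 for vcmpequb, 966 for
// vcmpbfp).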
switch (IntrinsicID) {
default:
return false;
// Comparison predicates.
case Intrinsic::ppc_altivec_vcmpbfp_p:
CompareOpc = 966;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpeqfp_p:
CompareOpc = 198;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpequb_p:
CompareOpc = 6;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpequh_p:
CompareOpc = 70;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpequw_p:
CompareOpc = 134;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpequd_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 199;
isDot = true;
} else
return false;
break;
case Intrinsic::ppc_altivec_vcmpneb_p:
case Intrinsic::ppc_altivec_vcmpneh_p:
case Intrinsic::ppc_altivec_vcmpnew_p:
case Intrinsic::ppc_altivec_vcmpnezb_p:
case Intrinsic::ppc_altivec_vcmpnezh_p:
case Intrinsic::ppc_altivec_vcmpnezw_p:
if (Subtarget.hasP9Altivec()) {
switch (IntrinsicID) {
default:
llvm_unreachable("Unknown comparison intrinsic.");
case Intrinsic::ppc_altivec_vcmpneb_p:
CompareOpc = 7;
break;
case Intrinsic::ppc_altivec_vcmpneh_p:
CompareOpc = 71;
break;
case Intrinsic::ppc_altivec_vcmpnew_p:
CompareOpc = 135;
break;
case Intrinsic::ppc_altivec_vcmpnezb_p:
CompareOpc = 263;
break;
case Intrinsic::ppc_altivec_vcmpnezh_p:
CompareOpc = 327;
break;
case Intrinsic::ppc_altivec_vcmpnezw_p:
CompareOpc = 391;
break;
}
isDot = true;
} else
return false;
break;
case Intrinsic::ppc_altivec_vcmpgefp_p:
CompareOpc = 454;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtfp_p:
CompareOpc = 710;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtsb_p:
CompareOpc = 774;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtsh_p:
CompareOpc = 838;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtsw_p:
CompareOpc = 902;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtsd_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 967;
isDot = true;
} else
return false;
break;
case Intrinsic::ppc_altivec_vcmpgtub_p:
CompareOpc = 518;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtuh_p:
CompareOpc = 582;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtuw_p:
CompareOpc = 646;
isDot = true;
break;
case Intrinsic::ppc_altivec_vcmpgtud_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 711;
isDot = true;
} else
return false;
break;
// VSX predicate comparisons use the same infrastructure
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
case Intrinsic::ppc_vsx_xvcmpgedp_p:
case Intrinsic::ppc_vsx_xvcmpgtdp_p:
case Intrinsic::ppc_vsx_xvcmpeqsp_p:
case Intrinsic::ppc_vsx_xvcmpgesp_p:
case Intrinsic::ppc_vsx_xvcmpgtsp_p:
if (Subtarget.hasVSX()) {
switch (IntrinsicID) {
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
CompareOpc = 99;
break;
case Intrinsic::ppc_vsx_xvcmpgedp_p:
CompareOpc = 115;
break;
case Intrinsic::ppc_vsx_xvcmpgtdp_p:
CompareOpc = 107;
break;
case Intrinsic::ppc_vsx_xvcmpeqsp_p:
CompareOpc = 67;
break;
case Intrinsic::ppc_vsx_xvcmpgesp_p:
CompareOpc = 83;
break;
case Intrinsic::ppc_vsx_xvcmpgtsp_p:
CompareOpc = 75;
break;
}
isDot = true;
} else
return false;
break;
// Normal Comparisons.
case Intrinsic::ppc_altivec_vcmpbfp:
CompareOpc = 966;
break;
case Intrinsic::ppc_altivec_vcmpeqfp:
CompareOpc = 198;
break;
case Intrinsic::ppc_altivec_vcmpequb:
CompareOpc = 6;
break;
case Intrinsic::ppc_altivec_vcmpequh:
CompareOpc = 70;
break;
case Intrinsic::ppc_altivec_vcmpequw:
CompareOpc = 134;
break;
case Intrinsic::ppc_altivec_vcmpequd:
if (Subtarget.hasP8Altivec())
CompareOpc = 199;
else
return false;
break;
case Intrinsic::ppc_altivec_vcmpneb:
case Intrinsic::ppc_altivec_vcmpneh:
case Intrinsic::ppc_altivec_vcmpnew:
case Intrinsic::ppc_altivec_vcmpnezb:
case Intrinsic::ppc_altivec_vcmpnezh:
case Intrinsic::ppc_altivec_vcmpnezw:
if (Subtarget.hasP9Altivec())
switch (IntrinsicID) {
default:
llvm_unreachable("Unknown comparison intrinsic.");
case Intrinsic::ppc_altivec_vcmpneb:
CompareOpc = 7;
break;
case Intrinsic::ppc_altivec_vcmpneh:
CompareOpc = 71;
break;
case Intrinsic::ppc_altivec_vcmpnew:
CompareOpc = 135;
break;
case Intrinsic::ppc_altivec_vcmpnezb:
CompareOpc = 263;
break;
case Intrinsic::ppc_altivec_vcmpnezh:
CompareOpc = 327;
break;
case Intrinsic::ppc_altivec_vcmpnezw:
CompareOpc = 391;
break;
}
else
return false;
break;
case Intrinsic::ppc_altivec_vcmpgefp:
CompareOpc = 454;
break;
case Intrinsic::ppc_altivec_vcmpgtfp:
CompareOpc = 710;
break;
case Intrinsic::ppc_altivec_vcmpgtsb:
CompareOpc = 774;
break;
case Intrinsic::ppc_altivec_vcmpgtsh:
CompareOpc = 838;
break;
case Intrinsic::ppc_altivec_vcmpgtsw:
CompareOpc = 902;
break;
case Intrinsic::ppc_altivec_vcmpgtsd:
if (Subtarget.hasP8Altivec())
CompareOpc = 967;
else
return false;
break;
case Intrinsic::ppc_altivec_vcmpgtub:
CompareOpc = 518;
break;
case Intrinsic::ppc_altivec_vcmpgtuh:
CompareOpc = 582;
break;
case Intrinsic::ppc_altivec_vcmpgtuw:
CompareOpc = 646;
break;
case Intrinsic::ppc_altivec_vcmpgtud:
if (Subtarget.hasP8Altivec())
CompareOpc = 711;
else
return false;
break;
}
return true;
}
/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
/// lower, do it, otherwise return null.
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
if (IntrinsicID == Intrinsic::thread_pointer) {
// Reads the thread pointer register, used for __builtin_thread_pointer.
if (Subtarget.isPPC64())
return DAG.getRegister(PPC::X13, MVT::i64);
return DAG.getRegister(PPC::R2, MVT::i32);
}
// If this is a lowered altivec predicate compare, CompareOpc is set to the
// opcode number of the comparison.
int CompareOpc;
bool isDot;
if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
return SDValue(); // Don't custom lower most intrinsics.
// If this is a non-dot comparison, make the VCMP node and we are done.
if (!isDot) {
SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
Op.getOperand(1), Op.getOperand(2),
DAG.getConstant(CompareOpc, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
}
// Create the PPCISD altivec 'dot' comparison node.
SDValue Ops[] = {
Op.getOperand(2), // LHS
Op.getOperand(3), // RHS
DAG.getConstant(CompareOpc, dl, MVT::i32)
};
EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
// Now that we have the comparison, emit a copy from the CR to a GPR.
// This is flagged to the above dot comparison.
SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
DAG.getRegister(PPC::CR6, MVT::i32),
CompNode.getValue(1));
// Unpack the result based on how the target uses it.
unsigned BitNo; // Bit # of CR6.
bool InvertBit; // Invert result?
switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
default: // Can't happen, don't crash on invalid number though.
case 0: // Return the value of the EQ bit of CR6.
BitNo = 0; InvertBit = false;
break;
case 1: // Return the inverted value of the EQ bit of CR6.
BitNo = 0; InvertBit = true;
break;
case 2: // Return the value of the LT bit of CR6.
BitNo = 2; InvertBit = false;
break;
case 3: // Return the inverted value of the LT bit of CR6.
BitNo = 2; InvertBit = true;
break;
}
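// With MFOCRF, the CR6 field lands in bits 4-7 of the i32 result (counting
// from the LSB): SO = 4, EQ = 5, GT = 6, LT = 7. The shift amount below,
// 8 - (3 - BitNo), therefore brings EQ (BitNo == 0, shift 5) or LT
// (BitNo == 2, shift 7) down to bit 0.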
// Shift the bit into the low position.
Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
// Isolate the bit.
Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
DAG.getConstant(1, dl, MVT::i32));
// If we are supposed to, toggle the bit.
if (InvertBit)
Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
DAG.getConstant(1, dl, MVT::i32));
return Flags;
}
SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
// SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain at
// the beginning of the argument list.
int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
SDLoc DL(Op);
switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) {
case Intrinsic::ppc_cfence: {
assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
Op.getOperand(ArgStart + 1)),
Op.getOperand(0)),
0);
}
default:
break;
}
return SDValue();
}
SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
// Check for a DIV with the same operands as this REM.
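// If one exists, returning SDValue() here falls back to the default
// expansion (div + mul + sub), which can reuse that DIV; otherwise the REM
// is returned unchanged so it can be selected to a hardware modulo
// instruction.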
for (auto UI : Op.getOperand(1)->uses()) {
if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) ||
(Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV))
if (UI->getOperand(0) == Op.getOperand(0) &&
UI->getOperand(1) == Op.getOperand(1))
return SDValue();
}
return Op;
}
// Lower scalar BSWAP64 to xxbrd.
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
// MTVSRDD
Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
Op.getOperand(0));
// XXBRD
Op = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Op);
// MFVSRD
int VectorIndex = 0;
if (Subtarget.isLittleEndian())
VectorIndex = 1;
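// Presumably element 1 is chosen on little-endian so the extract maps
// directly onto MFVSRD, which reads the doubleword holding element 1 there,
// without needing an extra swap.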
Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
return Op;
}
// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
// compared to a value that is atomically loaded (atomic loads zero-extend).
SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
"Expecting an atomic compare-and-swap here.");
SDLoc dl(Op);
auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
EVT MemVT = AtomicNode->getMemoryVT();
if (MemVT.getSizeInBits() >= 32)
return Op;
SDValue CmpOp = Op.getOperand(2);
// If this is already correctly zero-extended, leave it alone.
auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
if (DAG.MaskedValueIsZero(CmpOp, HighBits))
return Op;
// Clear the high bits of the compare operand.
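// For example, MaskVal is 0xFF for an i8 operation and 0xFFFF for i16.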
unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
SDValue NewCmpOp =
DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
DAG.getConstant(MaskVal, dl, MVT::i32));
// Replace the existing compare operand with the properly zero-extended one.
SmallVector<SDValue, 4> Ops;
for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
Ops.push_back(AtomicNode->getOperand(i));
Ops[2] = NewCmpOp;
MachineMemOperand *MMO = AtomicNode->getMemOperand();
SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
auto NodeTy =
(MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
}
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
// Create a stack slot that is 16-byte aligned.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, 16, false);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
// Store the input value into Value#0 of the stack slot.
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
MachinePointerInfo());
// Load it out.
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
"Should only be called for ISD::INSERT_VECTOR_ELT");
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
// We have legal lowering for constant indices but not for variable ones.
if (!C)
return SDValue();
EVT VT = Op.getValueType();
SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
// We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
if (VT == MVT::v8i16 || VT == MVT::v16i8) {
SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
unsigned InsertAtElement = C->getZExtValue();
unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
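// The VECINSERT byte offset counts from the most-significant end of the
// vector register, so on little-endian targets the offset is mirrored
// within the 16-byte register below.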
if (Subtarget.isLittleEndian()) {
InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
}
return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
}
return Op;
}
SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDNode *N = Op.getNode();
assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
"Unknown extract_vector_elt type");
SDValue Value = N->getOperand(0);
// The first part of this is like the store lowering except that we don't
// need to track the chain.
// The values are now known to be -1 (false) or 1 (true). To convert this
// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
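// Check: V = -1 gives 0.5*(-1) + 0.5 = 0.0 (false), and V = 1 gives
// 0.5*1 + 0.5 = 1.0 (true), as intended.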
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
// FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
// understand how to form the extending load.
SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
// Now convert to an integer and store.
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
Value);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, 16, false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue StoreChain = DAG.getEntryNode();
SDValue Ops[] = {StoreChain,
DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
Value, FIdx};
SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
// Extract the value requested.
unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
SDValue IntVal =
DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));
if (!Subtarget.useCRBits())
return IntVal;
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal);
}
/// Lowering for QPX v4i1 loads
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
SDValue LoadChain = LN->getChain();
SDValue BasePtr = LN->getBasePtr();
if (Op.getValueType() == MVT::v4f64 ||
Op.getValueType() == MVT::v4f32) {
EVT MemVT = LN->getMemoryVT();
unsigned Alignment = LN->getAlignment();
// If this load is properly aligned, then it is legal.
if (Alignment >= MemVT.getStoreSize())
return Op;
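// Otherwise scalarize: emit four stride-separated element loads, merge
// their chains with a TokenFactor, and rebuild the result via
// BUILD_VECTOR.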
EVT ScalarVT = Op.getValueType().getScalarType(),
ScalarMemVT = MemVT.getScalarType();
unsigned Stride = ScalarMemVT.getStoreSize();
SDValue Vals[4], LoadChains[4];
for (unsigned Idx = 0; Idx < 4; ++Idx) {
SDValue Load;
if (ScalarVT != ScalarMemVT)
Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
BasePtr,
LN->getPointerInfo().getWithOffset(Idx * Stride),
ScalarMemVT, MinAlign(Alignment, Idx * Stride),
LN->getMemOperand()->getFlags(), LN->getAAInfo());
else
Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
LN->getPointerInfo().getWithOffset(Idx * Stride),
MinAlign(Alignment, Idx * Stride),
LN->getMemOperand()->getFlags(), LN->getAAInfo());
if (Idx == 0 && LN->isIndexed()) {
assert(LN->getAddressingMode() == ISD::PRE_INC &&
"Unknown addressing mode on vector load");
Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
LN->getAddressingMode());
}
Vals[Idx] = Load;
LoadChains[Idx] = Load.getValue(1);
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Stride, dl,
BasePtr.getValueType()));
}
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);
if (LN->isIndexed()) {
SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
return DAG.getMergeValues(RetOps, dl);
}
SDValue RetOps[] = { Value, TF };
return DAG.getMergeValues(RetOps, dl);
}
assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");
// To lower v4i1 from a byte array, we load the byte elements of the
// vector and then reuse the BUILD_VECTOR logic.
SDValue VectElmts[4], VectElmtChains[4];
for (unsigned i = 0; i < 4; ++i) {
SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
VectElmts[i] = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
LN->getPointerInfo().getWithOffset(i), MVT::i8,
/* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
VectElmtChains[i] = VectElmts[i].getValue(1);
}
LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);
SDValue RVals[] = { Value, LoadChain };
return DAG.getMergeValues(RVals, dl);
}
/// Lowering for QPX v4i1 stores
SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
SDValue StoreChain = SN->getChain();
SDValue BasePtr = SN->getBasePtr();
SDValue Value = SN->getValue();
if (Value.getValueType() == MVT::v4f64 ||
Value.getValueType() == MVT::v4f32) {
EVT MemVT = SN->getMemoryVT();
unsigned Alignment = SN->getAlignment();
// If this store is properly aligned, then it is legal.
if (Alignment >= MemVT.getStoreSize())
return Op;
EVT ScalarVT = Value.getValueType().getScalarType(),
ScalarMemVT = MemVT.getScalarType();
unsigned Stride = ScalarMemVT.getStoreSize();
SDValue Stores[4];
for (unsigned Idx = 0; Idx < 4; ++Idx) {
SDValue Ex = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout())));
SDValue Store;
if (ScalarVT != ScalarMemVT)
Store =
DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
SN->getPointerInfo().getWithOffset(Idx * Stride),
ScalarMemVT, MinAlign(Alignment, Idx * Stride),
SN->getMemOperand()->getFlags(), SN->getAAInfo());
else
Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
SN->getPointerInfo().getWithOffset(Idx * Stride),
MinAlign(Alignment, Idx * Stride),
SN->getMemOperand()->getFlags(), SN->getAAInfo());
if (Idx == 0 && SN->isIndexed()) {
assert(SN->getAddressingMode() == ISD::PRE_INC &&
"Unknown addressing mode on vector store");
Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
SN->getAddressingMode());
}
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Stride, dl,
BasePtr.getValueType()));
Stores[Idx] = Store;
}
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
if (SN->isIndexed()) {
SDValue RetOps[] = { TF, Stores[0].getValue(1) };
return DAG.getMergeValues(RetOps, dl);
}
return TF;
}
assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");
// The values are now known to be -1 (false) or 1 (true). To convert this
// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
// FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
// understand how to form the extending load.
SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
// Now convert to an integer and store.
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
Value);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, 16, false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Ops[] = {StoreChain,
DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
Value, FIdx};
SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
// Move data into the byte array.
SDValue Loads[4], LoadChains[4];
for (unsigned i = 0; i < 4; ++i) {
unsigned Offset = 4*i;
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
PtrInfo.getWithOffset(Offset));
LoadChains[i] = Loads[i].getValue(1);
}
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
SDValue Stores[4];
for (unsigned i = 0; i < 4; ++i) {
SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
Stores[i] = DAG.getTruncStore(
StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(),
SN->getAAInfo());
}
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
return StoreChain;
}
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
if (Op.getValueType() == MVT::v4i32) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl);
SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl); // +16 as shift amt.
SDValue RHSSwap = // = vrlw RHS, 16
BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
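// With each 32-bit lane split into 16-bit halves a = aH:aL, b = bH:bL,
// the product mod 2^32 is aL*bL + ((aL*bH + aH*bL) << 16). vmulouh
// computes the aL*bL terms, vmsumuhm against the rotated RHS accumulates
// the two cross terms, and the vslw below supplies the << 16.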
// Shrinkify inputs to v8i16.
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);
// Low parts multiplied together, generating 32-bit results (we ignore the
// top parts).
SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
LHS, RHS, DAG, dl, MVT::v4i32);
SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
// Shift the high parts up 16 bits.
HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
Neg16, DAG, dl);
return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
} else if (Op.getValueType() == MVT::v8i16) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
LHS, RHS, Zero, DAG, dl);
} else if (Op.getValueType() == MVT::v16i8) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
bool isLittleEndian = Subtarget.isLittleEndian();
// Multiply the even 8-bit parts, producing 16-bit sums.
SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
LHS, RHS, DAG, dl, MVT::v8i16);
EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
// Multiply the odd 8-bit parts, producing 16-bit sums.
SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
LHS, RHS, DAG, dl, MVT::v8i16);
OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);
// Merge the results together. Because vmuleub and vmuloub are
// instructions with a big-endian bias, we must reverse the
// element numbering and reverse the meaning of "odd" and "even"
// when generating little endian code.
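// Each 16-bit partial product carries the truncated 8-bit result in its
// low byte; the shuffle masks below pick out exactly those bytes from the
// even and odd products and interleave them back into lane order.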
int Ops[16];
for (unsigned i = 0; i != 8; ++i) {
if (isLittleEndian) {
Ops[i*2 ] = 2*i;
Ops[i*2+1] = 2*i+16;
} else {
Ops[i*2 ] = 2*i+1;
Ops[i*2+1] = 2*i+1+16;
}
}
if (isLittleEndian)
return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
else
return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
} else {
llvm_unreachable("Unknown mul to lower!");
}
}
SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS");
EVT VT = Op.getValueType();
assert(VT.isVector() &&
"Only set vector abs as custom, scalar abs shouldn't reach here!");
assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
VT == MVT::v16i8) &&
"Unexpected vector element type!");
assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) &&
"Current subtarget doesn't support smax v2i64!");
// For vector abs, it can be lowered to:
// abs x
// ==>
// y = -x
// smax(x, y)
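// e.g. x = -7: y = 7, smax(-7, 7) = 7; x = 7: y = -7, smax(7, -7) = 7.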
SDLoc dl(Op);
SDValue X = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X);
// The SMAX patch (https://reviews.llvm.org/D47332) hasn't landed yet, so
// use the vector max intrinsics for now.
// TODO: Use ISD::SMAX directly once that patch has landed.
Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw;
if (VT == MVT::v2i64)
BifID = Intrinsic::ppc_altivec_vmaxsd;
else if (VT == MVT::v8i16)
BifID = Intrinsic::ppc_altivec_vmaxsh;
else if (VT == MVT::v16i8)
BifID = Intrinsic::ppc_altivec_vmaxsb;
return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
}
// Custom lowering for fpext v2f32 to v2f64
SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::FP_EXTEND &&
"Should only be called for ISD::FP_EXTEND");
// FIXME: handle extends from half precision float vectors on P9.
// We only want to custom lower an extend from v2f32 to v2f64.
if (Op.getValueType() != MVT::v2f64 ||
Op.getOperand(0).getValueType() != MVT::v2f32)
return SDValue();
SDLoc dl(Op);
SDValue Op0 = Op.getOperand(0);
switch (Op0.getOpcode()) {
default:
return SDValue();
case ISD::EXTRACT_SUBVECTOR: {
assert(Op0.getNumOperands() == 2 &&
isa<ConstantSDNode>(Op0->getOperand(1)) &&
"Node should have 2 operands with second one being a constant!");
if (Op0.getOperand(0).getValueType() != MVT::v4f32)
return SDValue();
// Custom lower is only done for high or low doubleword.
int Idx = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (Idx % 2 != 0)
return SDValue();
// Since input is v4f32, at this point Idx is either 0 or 2.
// Shift to get the doubleword position we want.
int DWord = Idx >> 1;
// High and low word positions are different on little endian.
if (Subtarget.isLittleEndian())
DWord ^= 0x1;
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64,
Op0.getOperand(0), DAG.getConstant(DWord, dl, MVT::i32));
}
case ISD::FADD:
case ISD::FMUL:
case ISD::FSUB: {
SDValue NewLoad[2];
for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
// Ensure both inputs are loads.
SDValue LdOp = Op0.getOperand(i);
if (LdOp.getOpcode() != ISD::LOAD)
return SDValue();
// Generate new load node.
LoadSDNode *LD = cast<LoadSDNode>(LdOp);
SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
NewLoad[i] = DAG.getMemIntrinsicNode(
PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
LD->getMemoryVT(), LD->getMemOperand());
}
SDValue NewOp =
DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, NewLoad[0],
NewLoad[1], Op0.getNode()->getFlags());
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp,
DAG.getConstant(0, dl, MVT::i32));
}
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(Op0);
SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
SDValue NewLd = DAG.getMemIntrinsicNode(
PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
LD->getMemoryVT(), LD->getMemOperand());
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd,
DAG.getConstant(0, dl, MVT::i32));
}
}
llvm_unreachable("ERROR:Should return for all cases within swtich.");
}
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Wasn't expecting to be able to lower this!");
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
// Variable argument lowering.
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, DAG);
case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::GET_DYNAMIC_AREA_OFFSET:
return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
// Exception handling lowering.
case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op));
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
// Lower 64-bit shifts.
case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
// Vector-related lowering.
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
case ISD::BITCAST: return LowerBITCAST(Op, DAG);
// Frame & Return address.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::INTRINSIC_VOID:
return LowerINTRINSIC_VOID(Op, DAG);
case ISD::SREM:
case ISD::UREM:
return LowerREM(Op, DAG);
case ISD::BSWAP:
return LowerBSWAP(Op, DAG);
case ISD::ATOMIC_CMP_SWAP:
return LowerATOMIC_CMP_SWAP(Op, DAG);
}
}
void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
SDLoc dl(N);
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
case ISD::READCYCLECOUNTER: {
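// On 32-bit targets the 64-bit time base is returned as two i32 halves
// plus a chain; the TBU/TB retry loop itself is emitted later, under
// PPC::ReadTB in EmitInstrWithCustomInserter.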
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
Results.push_back(RTB);
Results.push_back(RTB.getValue(1));
Results.push_back(RTB.getValue(2));
break;
}
case ISD::INTRINSIC_W_CHAIN: {
if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
Intrinsic::loop_decrement)
break;
assert(N->getValueType(0) == MVT::i1 &&
"Unexpected result type for CTR decrement intrinsic");
EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
N->getValueType(0));
SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
N->getOperand(1));
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt));
Results.push_back(NewInt.getValue(1));
break;
}
case ISD::VAARG: {
if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
return;
EVT VT = N->getValueType(0);
if (VT == MVT::i64) {
SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);
Results.push_back(NewNode);
Results.push_back(NewNode.getValue(1));
}
return;
}
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
// LowerFP_TO_INT() can only handle f32 and f64.
if (N->getOperand(0).getValueType() == MVT::ppcf128)
return;
Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
return;
case ISD::TRUNCATE: {
EVT TrgVT = N->getValueType(0);
EVT OpVT = N->getOperand(0).getValueType();
if (TrgVT.isVector() &&
isOperationCustom(N->getOpcode(), TrgVT) &&
OpVT.getSizeInBits() <= 128 &&
isPowerOf2_32(OpVT.getVectorElementType().getSizeInBits()))
Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
return;
}
case ISD::BITCAST:
// Don't handle bitcast here.
return;
case ISD::FP_EXTEND:
SDValue Lowered = LowerFP_EXTEND(SDValue(N, 0), DAG);
if (Lowered)
Results.push_back(Lowered);
return;
}
}
//===----------------------------------------------------------------------===//
// Other Lowering Code
//===----------------------------------------------------------------------===//
static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *Func = Intrinsic::getDeclaration(M, Id);
return Builder.CreateCall(Func, {});
}
// The mappings for emitLeading/TrailingFence are taken from
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
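// In short: seq_cst gets a full sync before the access, release and
// stronger get lwsync before, and acquire and stronger get lwsync after
// (or a cfence after 64-bit atomic loads).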
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
return nullptr;
}
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
// and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
if (isa<LoadInst>(Inst) && Subtarget.isPPC64())
return Builder.CreateCall(
Intrinsic::getDeclaration(
Builder.GetInsertBlock()->getParent()->getParent(),
Intrinsic::ppc_cfence, {Inst->getType()}),
{Inst});
// FIXME: Can use isync for rmw operation.
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
}
return nullptr;
}
MachineBasicBlock *
PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
unsigned AtomicSize,
unsigned BinOpcode,
unsigned CmpOpcode,
unsigned CmpPred) const {
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
auto LoadMnemonic = PPC::LDARX;
auto StoreMnemonic = PPC::STDCX;
switch (AtomicSize) {
default:
llvm_unreachable("Unexpected size of atomic entity");
case 1:
LoadMnemonic = PPC::LBARX;
StoreMnemonic = PPC::STBCX;
assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
break;
case 2:
LoadMnemonic = PPC::LHARX;
StoreMnemonic = PPC::STHCX;
assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
break;
case 4:
LoadMnemonic = PPC::LWARX;
StoreMnemonic = PPC::STWCX;
break;
case 8:
LoadMnemonic = PPC::LDARX;
StoreMnemonic = PPC::STDCX;
break;
}
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register incr = MI.getOperand(3).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB =
CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loopMBB);
if (CmpOpcode)
F->insert(It, loop2MBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register TmpReg = (!BinOpcode) ? incr :
RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
: &PPC::GPRCRegClass);
// thisMBB:
// ...
// fallthrough --> loopMBB
BB->addSuccessor(loopMBB);
// loopMBB:
// l[wd]arx dest, ptr
// add r0, dest, incr
// st[wd]cx. r0, ptr
// bne- loopMBB
// fallthrough --> exitMBB
// For max/min...
// loopMBB:
// l[wd]arx dest, ptr
// cmpl?[wd] incr, dest
// bgt exitMBB
// loop2MBB:
// st[wd]cx. dest, ptr
// bne- loopMBB
// fallthrough --> exitMBB
BB = loopMBB;
BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
.addReg(ptrA).addReg(ptrB);
if (BinOpcode)
BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
if (CmpOpcode) {
// Signed comparisons of byte or halfword values must be sign-extended.
if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
Register ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
ExtReg).addReg(dest);
BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
.addReg(incr).addReg(ExtReg);
} else
BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
.addReg(incr).addReg(dest);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB);
BB->addSuccessor(loop2MBB);
BB->addSuccessor(exitMBB);
BB = loop2MBB;
}
BuildMI(BB, dl, TII->get(StoreMnemonic))
.addReg(TmpReg).addReg(ptrA).addReg(ptrB);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
BB->addSuccessor(loopMBB);
BB->addSuccessor(exitMBB);
// exitMBB:
// ...
BB = exitMBB;
return BB;
}
MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
MachineInstr &MI, MachineBasicBlock *BB,
bool is8bit, // operation
unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const {
// If we support part-word atomic mnemonics, just use them.
if (Subtarget.hasPartwordAtomics())
return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode,
CmpPred);
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// In 64 bit mode we have to use 64 bits for addresses, even though the
// lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
// registers without caring whether they're 32 or 64, but here we're
// doing actual arithmetic on the addresses.
bool is64bit = Subtarget.isPPC64();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register incr = MI.getOperand(3).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB =
CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loopMBB);
if (CmpOpcode)
F->insert(It, loop2MBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
const TargetRegisterClass *RC =
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
Register PtrReg = RegInfo.createVirtualRegister(RC);
Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
Register ShiftReg =
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
Register Incr2Reg = RegInfo.createVirtualRegister(GPRC);
Register MaskReg = RegInfo.createVirtualRegister(GPRC);
Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
Register Ptr1Reg;
Register TmpReg =
(!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC);
// thisMBB:
// ...
// fallthrough --> loopMBB
BB->addSuccessor(loopMBB);
// The 4-byte load must be aligned, while a char or short may be
// anywhere in the word. Hence all this nasty bookkeeping code.
// add ptr1, ptrA, ptrB [copy if ptrA==0]
// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
// xori shift, shift1, 24 [16]
// rlwinm ptr, ptr1, 0, 0, 29
// slw incr2, incr, shift
// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
// slw mask, mask2, shift
// loopMBB:
// lwarx tmpDest, ptr
// add tmp, tmpDest, incr2
// andc tmp2, tmpDest, mask
// and tmp3, tmp, mask
// or tmp4, tmp3, tmp2
// stwcx. tmp4, ptr
// bne- loopMBB
// fallthrough --> exitMBB
// exitMBB:
// srw dest, tmpDest, shift
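// Concretely, shift1 is the subword's bit offset within its aligned word:
// (ptr & 3) << 3 for bytes, (ptr & 2) << 3 for halfwords. On big-endian
// the xori remaps it to count from the high end (shift ^ 24 for bytes,
// shift ^ 16 for halfwords), and the final rlwinm clears the two low
// address bits to form the aligned word address.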
if (ptrA != ZeroReg) {
Ptr1Reg = RegInfo.createVirtualRegister(RC);
BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
.addReg(ptrA)
.addReg(ptrB);
} else {
Ptr1Reg = ptrB;
}
// We need to use a 32-bit subregister here to avoid a register class
// mismatch in 64-bit mode.
BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
.addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
.addImm(3)
.addImm(27)
.addImm(is8bit ? 28 : 27);
if (!isLittleEndian)
BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
.addReg(Shift1Reg)
.addImm(is8bit ? 24 : 16);
if (is64bit)
BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
.addReg(Ptr1Reg)
.addImm(0)
.addImm(61);
else
BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
.addReg(Ptr1Reg)
.addImm(0)
.addImm(0)
.addImm(29);
BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg).addReg(incr).addReg(ShiftReg);
if (is8bit)
BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
else {
BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
.addReg(Mask3Reg)
.addImm(65535);
}
BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
.addReg(Mask2Reg)
.addReg(ShiftReg);
BB = loopMBB;
BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
.addReg(ZeroReg)
.addReg(PtrReg);
if (BinOpcode)
BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
.addReg(Incr2Reg)
.addReg(TmpDestReg);
BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
.addReg(TmpDestReg)
.addReg(MaskReg);
BuildMI(BB, dl, TII->get(PPC::AND), Tmp3Reg).addReg(TmpReg).addReg(MaskReg);
if (CmpOpcode) {
// For unsigned comparisons, we can directly compare the shifted values.
// For signed comparisons we shift and sign extend.
Register SReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(PPC::AND), SReg)
.addReg(TmpDestReg)
.addReg(MaskReg);
unsigned ValueReg = SReg;
unsigned CmpReg = Incr2Reg;
if (CmpOpcode == PPC::CMPW) {
ValueReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg)
.addReg(SReg)
.addReg(ShiftReg);
Register ValueSReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg)
.addReg(ValueReg);
ValueReg = ValueSReg;
CmpReg = incr;
}
BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
.addReg(CmpReg)
.addReg(ValueReg);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(CmpPred)
.addReg(PPC::CR0)
.addMBB(exitMBB);
BB->addSuccessor(loop2MBB);
BB->addSuccessor(exitMBB);
BB = loop2MBB;
}
BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg).addReg(Tmp3Reg).addReg(Tmp2Reg);
BuildMI(BB, dl, TII->get(PPC::STWCX))
.addReg(Tmp4Reg)
.addReg(ZeroReg)
.addReg(PtrReg);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(PPC::CR0)
.addMBB(loopMBB);
BB->addSuccessor(loopMBB);
BB->addSuccessor(exitMBB);
// exitMBB:
// ...
BB = exitMBB;
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
.addReg(TmpDestReg)
.addReg(ShiftReg);
return BB;
}
llvm::MachineBasicBlock *
PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
Register mainDstReg = MRI.createVirtualRegister(RC);
Register restoreDstReg = MRI.createVirtualRegister(RC);
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
// SjLjSetup mainMBB
// bl mainMBB
// v_restore = 1
// b sinkMBB
//
// mainMBB:
// buf[LabelOffset] = LR
// v_main = 0
//
// sinkMBB:
// v = phi(main, restore)
//
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MachineInstrBuilder MIB;
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// Note that the structure of the jmp_buf used here is not compatible
// with that used by libc, and is not designed to be. Specifically, it
// stores only those 'reserved' registers that LLVM does not otherwise
// understand how to spill. Also, by convention, by the time this
// intrinsic is called, Clang has already stored the frame address in the
// first slot of the buffer and stack address in the third. Following the
// X86 target code, we'll store the jump address in the second slot. We also
// need to save the TOC pointer (R2) to handle jumps between shared
// libraries, and that will be stored in the fourth slot. The thread
// identifier (R13) is not affected.
// thisMBB:
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t TOCOffset = 3 * PVT.getStoreSize();
const int64_t BPOffset = 4 * PVT.getStoreSize();
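// Resulting layout, in pointer-sized slots: [0] frame address, [1] jump
// address (saved LR), [2] stack address, [3] TOC (R2), [4] base pointer.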
// Prepare the IP in a register.
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
Register LabelReg = MRI.createVirtualRegister(PtrRC);
Register BufReg = MI.getOperand(1).getReg();
if (Subtarget.is64BitELFABI()) {
setUsesTOCBasePtr(*MBB->getParent());
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
.addReg(PPC::X2)
.addImm(TOCOffset)
.addReg(BufReg)
.cloneMemRefs(MI);
}
// Naked functions never have a base pointer, and so we use r1. For all
// other functions, this decision must be deferred until PEI.
unsigned BaseReg;
if (MF->getFunction().hasFnAttribute(Attribute::Naked))
BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
else
BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
MIB = BuildMI(*thisMBB, MI, DL,
TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
.addReg(BaseReg)
.addImm(BPOffset)
.addReg(BufReg)
.cloneMemRefs(MI);
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
MIB.addRegMask(TRI->getNoPreservedMask());
BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
.addMBB(mainMBB);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);
thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());
thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());
// mainMBB:
// mainDstReg = 0
MIB =
BuildMI(mainMBB, DL,
TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
// Store IP
if (Subtarget.isPPC64()) {
MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
.addReg(LabelReg)
.addImm(LabelOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
.addReg(LabelReg)
.addImm(LabelOffset)
.addReg(BufReg);
}
MIB.cloneMemRefs(MI);
BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
mainMBB->addSuccessor(sinkMBB);
// sinkMBB:
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(PPC::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(restoreDstReg).addMBB(thisMBB);
MI.eraseFromParent();
return sinkMBB;
}
MachineBasicBlock *
PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
unsigned BP =
(PVT == MVT::i64)
? PPC::X30
: (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
: PPC::R30);
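// On 32-bit SVR4, R29 is used when compiling position-independent code,
// presumably because R30 then serves as the PIC base register.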
MachineInstrBuilder MIB;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t SPOffset = 2 * PVT.getStoreSize();
const int64_t TOCOffset = 3 * PVT.getStoreSize();
const int64_t BPOffset = 4 * PVT.getStoreSize();
Register BufReg = MI.getOperand(0).getReg();
// Reload FP (the jumped-to function may not have had a
// frame pointer, and if so, then its r31 will be restored
// as necessary).
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)
.addImm(0)
.addReg(BufReg);
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)
.addImm(0)
.addReg(BufReg);
}
MIB.cloneMemRefs(MI);
// Reload IP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
.addImm(LabelOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
.addImm(LabelOffset)
.addReg(BufReg);
}
MIB.cloneMemRefs(MI);
// Reload SP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
.addImm(SPOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
.addImm(SPOffset)
.addReg(BufReg);
}
MIB.cloneMemRefs(MI);
// Reload BP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)
.addImm(BPOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)
.addImm(BPOffset)
.addReg(BufReg);
}
MIB.cloneMemRefs(MI);
// Reload TOC
if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
setUsesTOCBasePtr(*MBB->getParent());
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
.addImm(TOCOffset)
.addReg(BufReg)
.cloneMemRefs(MI);
}
// Jump
BuildMI(*MBB, MI, DL,
TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));
MI.eraseFromParent();
return MBB;
}
MachineBasicBlock *
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
if (MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
if (Subtarget.is64BitELFABI() &&
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
// Call lowering should have added an r2 operand to indicate a dependence
// on the TOC base pointer value. It can't, however, because there is no
// way to mark the dependence as implicit there, and so the stackmap code
// will confuse it with a regular operand. Instead, add the dependence
// here.
MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
}
return emitPatchPoint(MI, BB);
}
if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
return emitEHSjLjSetJmp(MI, BB);
} else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
return emitEHSjLjLongJmp(MI, BB);
}
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// To "insert" these instructions we actually have to insert their
// control-flow patterns.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
MachineFunction *F = BB->getParent();
if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 ||
MI.getOpcode() == PPC::SELECT_I8) {
SmallVector<MachineOperand, 2> Cond;
if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
MI.getOpcode() == PPC::SELECT_CC_I8)
Cond.push_back(MI.getOperand(4));
else
Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
Cond.push_back(MI.getOperand(1));
DebugLoc dl = MI.getDebugLoc();
TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
} else if (MI.getOpcode() == PPC::SELECT_CC_F4 ||
MI.getOpcode() == PPC::SELECT_CC_F8 ||
MI.getOpcode() == PPC::SELECT_CC_F16 ||
MI.getOpcode() == PPC::SELECT_CC_QFRC ||
MI.getOpcode() == PPC::SELECT_CC_QSRC ||
MI.getOpcode() == PPC::SELECT_CC_QBRC ||
MI.getOpcode() == PPC::SELECT_CC_VRRC ||
MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
MI.getOpcode() == PPC::SELECT_CC_VSRC ||
MI.getOpcode() == PPC::SELECT_CC_SPE4 ||
MI.getOpcode() == PPC::SELECT_CC_SPE ||
MI.getOpcode() == PPC::SELECT_F4 ||
MI.getOpcode() == PPC::SELECT_F8 ||
MI.getOpcode() == PPC::SELECT_F16 ||
MI.getOpcode() == PPC::SELECT_QFRC ||
MI.getOpcode() == PPC::SELECT_QSRC ||
MI.getOpcode() == PPC::SELECT_QBRC ||
MI.getOpcode() == PPC::SELECT_SPE ||
MI.getOpcode() == PPC::SELECT_SPE4 ||
MI.getOpcode() == PPC::SELECT_VRRC ||
MI.getOpcode() == PPC::SELECT_VSFRC ||
MI.getOpcode() == PPC::SELECT_VSSRC ||
MI.getOpcode() == PPC::SELECT_VSRC) {
// The incoming instruction knows the destination vreg to set, the
// condition code register to branch on, the true/false values to
// select between, and a branch opcode to use.
// thisMBB:
// ...
// TrueVal = ...
// cmpTY ccX, r1, r2
// bCC copy1MBB
// fallthrough --> copy0MBB
MachineBasicBlock *thisMBB = BB;
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
DebugLoc dl = MI.getDebugLoc();
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Next, add the true and fallthrough blocks as its successors.
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 ||
MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 ||
MI.getOpcode() == PPC::SELECT_F16 ||
MI.getOpcode() == PPC::SELECT_SPE4 ||
MI.getOpcode() == PPC::SELECT_SPE ||
MI.getOpcode() == PPC::SELECT_QFRC ||
MI.getOpcode() == PPC::SELECT_QSRC ||
MI.getOpcode() == PPC::SELECT_QBRC ||
MI.getOpcode() == PPC::SELECT_VRRC ||
MI.getOpcode() == PPC::SELECT_VSFRC ||
MI.getOpcode() == PPC::SELECT_VSSRC ||
MI.getOpcode() == PPC::SELECT_VSRC) {
BuildMI(BB, dl, TII->get(PPC::BC))
.addReg(MI.getOperand(1).getReg())
.addMBB(sinkMBB);
} else {
unsigned SelectPred = MI.getOperand(4).getImm();
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(SelectPred)
.addReg(MI.getOperand(1).getReg())
.addMBB(sinkMBB);
}
// copy0MBB:
// %FalseValue = ...
// # fallthrough to sinkMBB
BB = copy0MBB;
// Update machine-CFG edges
BB->addSuccessor(sinkMBB);
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
BB = sinkMBB;
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg())
.addReg(MI.getOperand(3).getReg())
.addMBB(copy0MBB)
.addReg(MI.getOperand(2).getReg())
.addMBB(thisMBB);
} else if (MI.getOpcode() == PPC::ReadTB) {
// To read the 64-bit time-base register on a 32-bit target, we read the
// two halves. Should the counter have wrapped while it was being read, we
// need to try again.
// ...
// readLoop:
// mfspr Rx,TBU # load from TBU
// mfspr Ry,TB # load from TB
// mfspr Rz,TBU # load from TBU
// cmpw crX,Rx,Rz # check if 'old'='new'
// bne readLoop # branch if they're not equal
// ...
MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
DebugLoc dl = MI.getDebugLoc();
F->insert(It, readMBB);
F->insert(It, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(readMBB);
BB = readMBB;
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
Register LoReg = MI.getOperand(0).getReg();
Register HiReg = MI.getOperand(1).getReg();
BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);
Register CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
.addReg(HiReg)
.addReg(ReadAgainReg);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(CmpReg)
.addMBB(readMBB);
BB->addSuccessor(readMBB);
BB->addSuccessor(sinkMBB);
} else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0);
else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
(Subtarget.hasPartwordAtomics() &&
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
(Subtarget.hasPartwordAtomics() &&
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
auto LoadMnemonic = PPC::LDARX;
auto StoreMnemonic = PPC::STDCX;
switch (MI.getOpcode()) {
default:
llvm_unreachable("Compare and swap of unknown size");
case PPC::ATOMIC_CMP_SWAP_I8:
LoadMnemonic = PPC::LBARX;
StoreMnemonic = PPC::STBCX;
assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
break;
case PPC::ATOMIC_CMP_SWAP_I16:
LoadMnemonic = PPC::LHARX;
StoreMnemonic = PPC::STHCX;
assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
break;
case PPC::ATOMIC_CMP_SWAP_I32:
LoadMnemonic = PPC::LWARX;
StoreMnemonic = PPC::STWCX;
break;
case PPC::ATOMIC_CMP_SWAP_I64:
LoadMnemonic = PPC::LDARX;
StoreMnemonic = PPC::STDCX;
break;
}
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register oldval = MI.getOperand(3).getReg();
Register newval = MI.getOperand(4).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loop1MBB);
F->insert(It, loop2MBB);
F->insert(It, midMBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// thisMBB:
// ...
// fallthrough --> loop1MBB
BB->addSuccessor(loop1MBB);
// loop1MBB:
// l[bhwd]arx dest, ptr
// cmp[wd] dest, oldval
// bne- midMBB
// loop2MBB:
// st[bhwd]cx. newval, ptr
// bne- loop1MBB
// b exitBB
// midMBB:
// st[bhwd]cx. dest, ptr
// exitBB:
BB = loop1MBB;
BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB);
BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
.addReg(oldval)
.addReg(dest);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(PPC::CR0)
.addMBB(midMBB);
BB->addSuccessor(loop2MBB);
BB->addSuccessor(midMBB);
BB = loop2MBB;
BuildMI(BB, dl, TII->get(StoreMnemonic))
.addReg(newval)
.addReg(ptrA)
.addReg(ptrB);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(PPC::CR0)
.addMBB(loop1MBB);
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
BB->addSuccessor(loop1MBB);
BB->addSuccessor(exitMBB);
BB = midMBB;
BuildMI(BB, dl, TII->get(StoreMnemonic))
.addReg(dest)
.addReg(ptrA)
.addReg(ptrB);
BB->addSuccessor(exitMBB);
// exitMBB:
// ...
BB = exitMBB;
} else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
// We must use 64-bit registers for addresses when targeting 64-bit,
// since we're actually doing arithmetic on them. Other registers
// can be 32-bit.
bool is64bit = Subtarget.isPPC64();
bool isLittleEndian = Subtarget.isLittleEndian();
bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register oldval = MI.getOperand(3).getReg();
Register newval = MI.getOperand(4).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loop1MBB);
F->insert(It, loop2MBB);
F->insert(It, midMBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
const TargetRegisterClass *RC =
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
Register PtrReg = RegInfo.createVirtualRegister(RC);
Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
Register ShiftReg =
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC);
Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC);
Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC);
Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC);
Register MaskReg = RegInfo.createVirtualRegister(GPRC);
Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
Register Ptr1Reg;
Register TmpReg = RegInfo.createVirtualRegister(GPRC);
Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
// thisMBB:
// ...
// fallthrough --> loop1MBB
BB->addSuccessor(loop1MBB);
// The 4-byte load must be aligned, while a char or short may be
// anywhere in the word. Hence all this nasty bookkeeping code.
// add ptr1, ptrA, ptrB [copy if ptrA==0]
// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
// xori shift, shift1, 24 [16]
// rlwinm ptr, ptr1, 0, 0, 29
// slw newval2, newval, shift
// slw oldval2, oldval, shift
// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
// slw mask, mask2, shift
// and newval3, newval2, mask
// and oldval3, oldval2, mask
// loop1MBB:
// lwarx tmpDest, ptr
// and tmp, tmpDest, mask
// cmpw tmp, oldval3
// bne- midMBB
// loop2MBB:
// andc tmp2, tmpDest, mask
// or tmp4, tmp2, newval3
// stwcx. tmp4, ptr
// bne- loop1MBB
// b exitBB
// midMBB:
// stwcx. tmpDest, ptr
// exitBB:
// srw dest, tmpDest, shift
if (ptrA != ZeroReg) {
Ptr1Reg = RegInfo.createVirtualRegister(RC);
BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
.addReg(ptrA)
.addReg(ptrB);
} else {
Ptr1Reg = ptrB;
}
// We need to use a 32-bit subregister here to avoid a register class
// mismatch in 64-bit mode.
BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
.addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
.addImm(3)
.addImm(27)
.addImm(is8bit ? 28 : 27);
if (!isLittleEndian)
BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
.addReg(Shift1Reg)
.addImm(is8bit ? 24 : 16);
if (is64bit)
BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
.addReg(Ptr1Reg)
.addImm(0)
.addImm(61);
else
BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
.addReg(Ptr1Reg)
.addImm(0)
.addImm(0)
.addImm(29);
BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
.addReg(newval)
.addReg(ShiftReg);
BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
.addReg(oldval)
.addReg(ShiftReg);
if (is8bit)
BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
else {
BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
.addReg(Mask3Reg)
.addImm(65535);
}
BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
.addReg(Mask2Reg)
.addReg(ShiftReg);
BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
.addReg(NewVal2Reg)
.addReg(MaskReg);
BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
.addReg(OldVal2Reg)
.addReg(MaskReg);
BB = loop1MBB;
BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
.addReg(ZeroReg)
.addReg(PtrReg);
BuildMI(BB, dl, TII->get(PPC::AND), TmpReg)
.addReg(TmpDestReg)
.addReg(MaskReg);
BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
.addReg(TmpReg)
.addReg(OldVal3Reg);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(PPC::CR0)
.addMBB(midMBB);
BB->addSuccessor(loop2MBB);
BB->addSuccessor(midMBB);
BB = loop2MBB;
BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
.addReg(TmpDestReg)
.addReg(MaskReg);
BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg)
.addReg(Tmp2Reg)
.addReg(NewVal3Reg);
BuildMI(BB, dl, TII->get(PPC::STWCX))
.addReg(Tmp4Reg)
.addReg(ZeroReg)
.addReg(PtrReg);
BuildMI(BB, dl, TII->get(PPC::BCC))
.addImm(PPC::PRED_NE)
.addReg(PPC::CR0)
.addMBB(loop1MBB);
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
BB->addSuccessor(loop1MBB);
BB->addSuccessor(exitMBB);
BB = midMBB;
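// On the compare-failure path, we still issue a stwcx. of the value we just
// loaded; the store has no useful memory effect, but it cancels the
// reservation established by the lwarx.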
BuildMI(BB, dl, TII->get(PPC::STWCX))
.addReg(TmpDestReg)
.addReg(ZeroReg)
.addReg(PtrReg);
BB->addSuccessor(exitMBB);
// exitMBB:
// ...
BB = exitMBB;
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
.addReg(TmpReg)
.addReg(ShiftReg);
} else if (MI.getOpcode() == PPC::FADDrtz) {
// This pseudo performs an FADD with rounding mode temporarily forced
// to round-to-zero. We emit this via custom inserter since the FPSCR
// is not modeled at the SelectionDAG level.
Register Dest = MI.getOperand(0).getReg();
Register Src1 = MI.getOperand(1).getReg();
Register Src2 = MI.getOperand(2).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);
// Set rounding mode to round-to-zero.
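// (mtfsb1 31 sets FPSCR bit 63 and mtfsb0 30 clears bit 62, leaving the RN
// field at 0b01, i.e. round toward zero; see the rounding-mode table in the
// SETRNDi handling below.)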
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);
// Perform addition.
BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);
// Restore FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
} else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT ||
MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) {
unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8)
? PPC::ANDI8_rec
: PPC::ANDI_rec;
bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8);
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register Dest = RegInfo.createVirtualRegister(
Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);
DebugLoc Dl = MI.getDebugLoc();
BuildMI(*BB, MI, Dl, TII->get(Opcode), Dest)
.addReg(MI.getOperand(1).getReg())
.addImm(1);
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
MI.getOperand(0).getReg())
.addReg(IsEQ ? PPC::CR0EQ : PPC::CR0GT);
} else if (MI.getOpcode() == PPC::TCHECK_RET) {
DebugLoc Dl = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
MI.getOperand(0).getReg())
.addReg(CRReg);
} else if (MI.getOpcode() == PPC::TBEGIN_RET) {
DebugLoc Dl = MI.getDebugLoc();
unsigned Imm = MI.getOperand(1).getImm();
BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm);
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
MI.getOperand(0).getReg())
.addReg(PPC::CR0EQ);
} else if (MI.getOpcode() == PPC::SETRNDi) {
DebugLoc dl = MI.getDebugLoc();
Register OldFPSCRReg = MI.getOperand(0).getReg();
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
// The floating-point rounding mode is in bits 62:63 of the FPSCR and has
// the following settings:
// 00 Round to nearest
// 01 Round to 0
// 10 Round to +inf
// 11 Round to -inf
// When the operand is an immediate, we use its two least significant bits
// to set bits 62:63 of the FPSCR.
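// For example, SETRNDi with an immediate of 1 (round to 0) lowers to
// mtfsb1 31; mtfsb0 30, while an immediate of 2 (round to +inf) lowers to
// mtfsb0 31; mtfsb1 30.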
unsigned Mode = MI.getOperand(1).getImm();
BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
.addImm(31);
BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
.addImm(30);
} else if (MI.getOpcode() == PPC::SETRND) {
DebugLoc dl = MI.getDebugLoc();
// Copy a register from F8RCRegClass (SrcReg) to G8RCRegClass (DestReg), or
// from G8RCRegClass (SrcReg) to F8RCRegClass (DestReg). If the target
// doesn't have DirectMove, we have to go through the stack to do the
// conversion, because the target lacks instructions such as mtvsrd and
// mfvsrd that could do it directly.
auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {
if (Subtarget.hasDirectMove()) {
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg)
.addReg(SrcReg);
} else {
// Use stack to do the register copy.
unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
MachineRegisterInfo &RegInfo = F->getRegInfo();
const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg);
if (RC == &PPC::F8RCRegClass) {
// Copy register from F8RCRegClass to G8RCRegclass.
assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
"Unsupported RegClass.");
StoreOp = PPC::STFD;
LoadOp = PPC::LD;
} else {
// Copy register from G8RCRegClass to F8RCRegclass.
assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
(RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
"Unsupported RegClass.");
}
MachineFrameInfo &MFI = F->getFrameInfo();
int FrameIdx = MFI.CreateStackObject(8, 8, false);
MachineMemOperand *MMOStore = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
MFI.getObjectAlignment(FrameIdx));
// Store the SrcReg into the stack.
BuildMI(*BB, MI, dl, TII->get(StoreOp))
.addReg(SrcReg)
.addImm(0)
.addFrameIndex(FrameIdx)
.addMemOperand(MMOStore);
MachineMemOperand *MMOLoad = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
MFI.getObjectAlignment(FrameIdx));
// Load from the stack where SrcReg is stored, and save to DestReg,
// so we have done the RegClass conversion from RegClass::SrcReg to
// RegClass::DestReg.
BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg)
.addImm(0)
.addFrameIndex(FrameIdx)
.addMemOperand(MMOLoad);
}
};
Register OldFPSCRReg = MI.getOperand(0).getReg();
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
// When the operand is a gprc register, use its two least significant bits
// together with the mtfsf instruction to set bits 62:63 of the FPSCR.
//
// copy OldFPSCRTmpReg, OldFPSCRReg
// (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
// rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
// copy NewFPSCRReg, NewFPSCRTmpReg
// mtfsf 255, NewFPSCRReg
MachineOperand SrcOp = MI.getOperand(1);
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
Register ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
Register ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
// The first operand of INSERT_SUBREG should be a register that has
// subregisters. Since we only care about its register class, we can use an
// IMPLICIT_DEF register.
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg);
BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg)
.addReg(ImDefReg)
.add(SrcOp)
.addImm(1);
Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)
.addReg(OldFPSCRTmpReg)
.addReg(ExtSrcReg)
.addImm(0)
.addImm(62);
Register NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
// The mask 255 means that bits 32:63 of NewFPSCRReg are copied into bits
// 32:63 of the FPSCR.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF))
.addImm(255)
.addReg(NewFPSCRReg)
.addImm(0)
.addImm(0);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
//===----------------------------------------------------------------------===//
// Target Optimization Hooks
//===----------------------------------------------------------------------===//
static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
// For the estimates, convergence is quadratic, so we essentially double the
// number of digits correct after every iteration. For both FRE and FRSQRTE,
// the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
// this is 2^-14. IEEE float has 23 digits and double has 52 digits.
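// For example (illustrative): starting from 2^-5, each step roughly squares
// the error (2^-10, 2^-20, 2^-40), so three steps cover f32's 23 fraction
// bits and a fourth covers f64's 52; with hasRecipPrec(), the 2^-14 estimate
// needs one step for f32 (2^-28) and two for f64 (2^-56).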
int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
if (VT.getScalarType() == MVT::f64)
RefinementSteps++;
return RefinementSteps;
}
SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled, int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Operand.getValueType();
if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
(VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
(VT == MVT::v2f64 && Subtarget.hasVSX()) ||
(VT == MVT::v4f32 && Subtarget.hasQPX()) ||
(VT == MVT::v4f64 && Subtarget.hasQPX())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
// The Newton-Raphson computation with a single constant does not provide
// enough accuracy on some CPUs.
UseOneConstNR = !Subtarget.needsTwoConstNR();
return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
}
return SDValue();
}
SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled,
int &RefinementSteps) const {
EVT VT = Operand.getValueType();
if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
(VT == MVT::f64 && Subtarget.hasFRE()) ||
(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
(VT == MVT::v2f64 && Subtarget.hasVSX()) ||
(VT == MVT::v4f32 && Subtarget.hasQPX()) ||
(VT == MVT::v4f64 && Subtarget.hasQPX())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
}
return SDValue();
}
unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
// Note: This functionality is used only when unsafe-fp-math is enabled, and
// on cores with reciprocal estimates (which are used when unsafe-fp-math is
// enabled for division), this functionality is redundant with the default
// combiner logic (once the division -> reciprocal/multiply transformation
// has taken place). As a result, this matters more for older cores than for
// newer ones.
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal if there are two or more FDIVs (for embedded cores with only
// one FP pipeline) or three or more FDIVs (for generic OOO cores).
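// For instance (illustrative), a/d + b/d can be rewritten as r = 1.0/d;
// a*r + b*r, trading each additional division by d for a single multiply.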
switch (Subtarget.getCPUDirective()) {
default:
return 3;
case PPC::DIR_440:
case PPC::DIR_A2:
case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
return 2;
}
}
// isConsecutiveLSLoc needs to work even if all adds have not yet been
// collapsed, and so we need to look through chains of them.
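// For example (illustrative), given (add (add X, 8), 16), this recurses
// through the inner add and accumulates Base = X and Offset += 24.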
static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
int64_t& Offset, SelectionDAG &DAG) {
if (DAG.isBaseWithConstantOffset(Loc)) {
Base = Loc.getOperand(0);
Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
// The base might itself be a base plus an offset, and if so, accumulate
// that as well.
getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG);
}
}
static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
unsigned Bytes, int Dist,
SelectionDAG &DAG) {
if (VT.getSizeInBits() / 8 != Bytes)
return false;
SDValue BaseLoc = Base->getBasePtr();
if (Loc.getOpcode() == ISD::FrameIndex) {
if (BaseLoc.getOpcode() != ISD::FrameIndex)
return false;
const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
int FS = MFI.getObjectSize(FI);
int BFS = MFI.getObjectSize(BFI);
if (FS != BFS || FS != (int)Bytes) return false;
return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);
}
SDValue Base1 = Loc, Base2 = BaseLoc;
int64_t Offset1 = 0, Offset2 = 0;
getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);
getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);
if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
return true;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const GlobalValue *GV1 = nullptr;
const GlobalValue *GV2 = nullptr;
Offset1 = 0;
Offset2 = 0;
bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
if (isGA1 && isGA2 && GV1 == GV2)
return Offset1 == (Offset2 + Dist*Bytes);
return false;
}
// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
// not enforce equality of the chain operands.
static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
unsigned Bytes, int Dist,
SelectionDAG &DAG) {
if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
EVT VT = LS->getMemoryVT();
SDValue Loc = LS->getBasePtr();
return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
}
if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
EVT VT;
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default: return false;
case Intrinsic::ppc_qpx_qvlfd:
case Intrinsic::ppc_qpx_qvlfda:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvlfs:
case Intrinsic::ppc_qpx_qvlfsa:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvlfcd:
case Intrinsic::ppc_qpx_qvlfcda:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvlfcs:
case Intrinsic::ppc_qpx_qvlfcsa:
VT = MVT::v2f32;
break;
case Intrinsic::ppc_qpx_qvlfiwa:
case Intrinsic::ppc_qpx_qvlfiwz:
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_vsx_lxvw4x:
case Intrinsic::ppc_vsx_lxvw4x_be:
VT = MVT::v4i32;
break;
case Intrinsic::ppc_vsx_lxvd2x:
case Intrinsic::ppc_vsx_lxvd2x_be:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_altivec_lvebx:
VT = MVT::i8;
break;
case Intrinsic::ppc_altivec_lvehx:
VT = MVT::i16;
break;
case Intrinsic::ppc_altivec_lvewx:
VT = MVT::i32;
break;
}
return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
}
if (N->getOpcode() == ISD::INTRINSIC_VOID) {
EVT VT;
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default: return false;
case Intrinsic::ppc_qpx_qvstfd:
case Intrinsic::ppc_qpx_qvstfda:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvstfs:
case Intrinsic::ppc_qpx_qvstfsa:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvstfcd:
case Intrinsic::ppc_qpx_qvstfcda:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvstfcs:
case Intrinsic::ppc_qpx_qvstfcsa:
VT = MVT::v2f32;
break;
case Intrinsic::ppc_qpx_qvstfiw:
case Intrinsic::ppc_qpx_qvstfiwa:
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_vsx_stxvw4x:
VT = MVT::v4i32;
break;
case Intrinsic::ppc_vsx_stxvd2x:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_vsx_stxvw4x_be:
VT = MVT::v4i32;
break;
case Intrinsic::ppc_vsx_stxvd2x_be:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_altivec_stvebx:
VT = MVT::i8;
break;
case Intrinsic::ppc_altivec_stvehx:
VT = MVT::i16;
break;
case Intrinsic::ppc_altivec_stvewx:
VT = MVT::i32;
break;
}
return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
}
return false;
}
// Return true if there is a nearby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking through
// token factors and other loads (but nothing else). As a result, a true result
// indicates that it is safe to create a new consecutive load adjacent to the
// load provided.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
SDValue Chain = LD->getChain();
EVT VT = LD->getMemoryVT();
SmallSet<SDNode *, 16> LoadRoots;
SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
SmallSet<SDNode *, 16> Visited;
// First, search up the chain, branching to follow all token-factor operands.
// If we find a consecutive load, then we're done, otherwise, record all
// nodes just above the top-level loads and token factors.
while (!Queue.empty()) {
SDNode *ChainNext = Queue.pop_back_val();
if (!Visited.insert(ChainNext).second)
continue;
if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
return true;
if (!Visited.count(ChainLD->getChain().getNode()))
Queue.push_back(ChainLD->getChain().getNode());
} else if (ChainNext->getOpcode() == ISD::TokenFactor) {
for (const SDUse &O : ChainNext->ops())
if (!Visited.count(O.getNode()))
Queue.push_back(O.getNode());
} else
LoadRoots.insert(ChainNext);
}
// Second, search down the chain, starting from the top-level nodes recorded
// in the first phase. These top-level nodes are the nodes just above all
// loads and token factors. Starting with their uses, recursively look through
// all loads (just the chain uses) and token factors to find a consecutive
// load.
Visited.clear();
Queue.clear();
for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
IE = LoadRoots.end(); I != IE; ++I) {
Queue.push_back(*I);
while (!Queue.empty()) {
SDNode *LoadRoot = Queue.pop_back_val();
if (!Visited.insert(LoadRoot).second)
continue;
if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
return true;
for (SDNode::use_iterator UI = LoadRoot->use_begin(),
UE = LoadRoot->use_end(); UI != UE; ++UI)
if (((isa<MemSDNode>(*UI) &&
cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
Queue.push_back(*UI);
}
}
return false;
}
/// This function is called when we have proved that a SETCC node can be replaced
/// by subtraction (and other supporting instructions) so that the result of
/// the comparison is kept in a GPR instead of a CR. This function is purely for
/// codegen purposes and has some flags to guide the codegen process.
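/// For example (illustrative), an i32 'setult x, y' with Size == 64 becomes:
/// zero-extend x and y to i64, subtract, shift the sign bit down with
/// 'srl ..., 63', and truncate to i1. SETULE additionally swaps the operands
/// and complements the result, since x <=u y is equivalent to !(y <u x).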
static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
bool Swap, SDLoc &DL, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
// Zero extend the operands to the largest legal integer. Originally, they
// must be of a strictly smaller size.
auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0),
DAG.getConstant(Size, DL, MVT::i32));
auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1),
DAG.getConstant(Size, DL, MVT::i32));
// Swap if needed. Depends on the condition code.
if (Swap)
std::swap(Op0, Op1);
// Subtract extended integers.
auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1);
// Move the sign bit to the least significant position and zero out the rest.
// Now the least significant bit carries the result of the original comparison.
auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode,
DAG.getConstant(Size - 1, DL, MVT::i32));
auto Final = Shifted;
// Complement the result if needed. Based on the condition code.
if (Complement)
Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted,
DAG.getConstant(1, DL, MVT::i64));
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final);
}
SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
// The size of the integers being compared plays a critical role in the
// following analysis, so we prefer to do this when all types are legal.
if (!DCI.isAfterLegalizeDAG())
return SDValue();
// If all users of the SETCC extend its value to a legal integer type,
// then we replace the SETCC with a subtraction.
for (SDNode::use_iterator UI = N->use_begin(),
UE = N->use_end(); UI != UE; ++UI) {
if (UI->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
}
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
auto OpSize = N->getOperand(0).getValueSizeInBits();
unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();
if (OpSize < Size) {
switch (CC) {
default: break;
case ISD::SETULT:
return generateEquivalentSub(N, Size, false, false, DL, DAG);
case ISD::SETULE:
return generateEquivalentSub(N, Size, true, true, DL, DAG);
case ISD::SETUGT:
return generateEquivalentSub(N, Size, false, true, DL, DAG);
case ISD::SETUGE:
return generateEquivalentSub(N, Size, true, false, DL, DAG);
}
}
return SDValue();
}
SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
// If we're tracking CR bits, we need to be careful that we don't have:
// trunc(binary-ops(zext(x), zext(y)))
// or
// trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
// such that we're unnecessarily moving things into GPRs when it would be
// better to keep them in CR bits.
// Note that trunc here can be an actual i1 trunc, or can be the effective
// truncation that comes from a setcc or select_cc.
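// For example (illustrative), trunc (and (zext i1 %a), (zext i1 %b)) to i1
// can become (and %a, %b) computed directly on the CR bits, with the zext
// and trunc nodes dissolved by the promotion below.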
if (N->getOpcode() == ISD::TRUNCATE &&
N->getValueType(0) != MVT::i1)
return SDValue();
if (N->getOperand(0).getValueType() != MVT::i32 &&
N->getOperand(0).getValueType() != MVT::i64)
return SDValue();
if (N->getOpcode() == ISD::SETCC ||
N->getOpcode() == ISD::SELECT_CC) {
// If we're looking at a comparison, then we need to make sure that the
// high bits (all except for the first) don't affect the result.
ISD::CondCode CC =
cast<CondCodeSDNode>(N->getOperand(
N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
unsigned OpBits = N->getOperand(0).getValueSizeInBits();
if (ISD::isSignedIntSetCC(CC)) {
if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
return SDValue();
} else if (ISD::isUnsignedIntSetCC(CC)) {
if (!DAG.MaskedValueIsZero(N->getOperand(0),
APInt::getHighBitsSet(OpBits, OpBits-1)) ||
!DAG.MaskedValueIsZero(N->getOperand(1),
APInt::getHighBitsSet(OpBits, OpBits-1)))
return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
: SDValue());
} else {
// This is neither a signed nor an unsigned comparison, just make sure
// that the high bits are equal.
KnownBits Op1Known = DAG.computeKnownBits(N->getOperand(0));
KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1));
// We don't really care about what is known about the first bit (if
// anything), so clear it in all masks prior to comparing them.
Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0);
Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0);
if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One)
return SDValue();
}
}
// We now know that the higher-order bits are irrelevant; we just need to
// make sure that all of the intermediate operations are bit operations, and
// all inputs are extensions.
if (N->getOperand(0).getOpcode() != ISD::AND &&
N->getOperand(0).getOpcode() != ISD::OR &&
N->getOperand(0).getOpcode() != ISD::XOR &&
N->getOperand(0).getOpcode() != ISD::SELECT &&
N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
return SDValue();
if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
N->getOperand(1).getOpcode() != ISD::AND &&
N->getOperand(1).getOpcode() != ISD::OR &&
N->getOperand(1).getOpcode() != ISD::XOR &&
N->getOperand(1).getOpcode() != ISD::SELECT &&
N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
return SDValue();
SmallVector<SDValue, 4> Inputs;
SmallVector<SDValue, 8> BinOps, PromOps;
SmallPtrSet<SDNode *, 16> Visited;
for (unsigned i = 0; i < 2; ++i) {
if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
isa<ConstantSDNode>(N->getOperand(i)))
Inputs.push_back(N->getOperand(i));
else
BinOps.push_back(N->getOperand(i));
if (N->getOpcode() == ISD::TRUNCATE)
break;
}
// Visit all inputs, collect all binary operations (and, or, xor and
// select) that are all fed by extensions.
while (!BinOps.empty()) {
SDValue BinOp = BinOps.back();
BinOps.pop_back();
if (!Visited.insert(BinOp.getNode()).second)
continue;
PromOps.push_back(BinOp);
for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
// The condition of the select is not promoted.
if (BinOp.getOpcode() == ISD::SELECT && i == 0)
continue;
if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
continue;
if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
isa<ConstantSDNode>(BinOp.getOperand(i))) {
Inputs.push_back(BinOp.getOperand(i));
} else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
BinOp.getOperand(i).getOpcode() == ISD::OR ||
BinOp.getOperand(i).getOpcode() == ISD::XOR ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
BinOps.push_back(BinOp.getOperand(i));
} else {
// We have an input that is not an extension or another binary
// operation; we'll abort this transformation.
return SDValue();
}
}
}
// Make sure that this is a self-contained cluster of operations (which
// is not quite the same thing as saying that everything has only one
// use).
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
if (isa<ConstantSDNode>(Inputs[i]))
continue;
for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
UE = Inputs[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User != N && !Visited.count(User))
return SDValue();
// Make sure that we're not going to promote the non-output-value
// operand(s) or SELECT or SELECT_CC.
// FIXME: Although we could sometimes handle this, and it does occur in
// practice that one of the condition inputs to the select is also one of
// the outputs, we currently can't deal with this.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == Inputs[i])
return SDValue();
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == Inputs[i] ||
User->getOperand(1) == Inputs[i])
return SDValue();
}
}
}
for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
UE = PromOps[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User != N && !Visited.count(User))
return SDValue();
// Make sure that we're not going to promote the non-output-value
// operand(s) or SELECT or SELECT_CC.
// FIXME: Although we could sometimes handle this, and it does occur in
// practice that one of the condition inputs to the select is also one of
// the outputs, we currently can't deal with this.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == PromOps[i])
return SDValue();
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == PromOps[i] ||
User->getOperand(1) == PromOps[i])
return SDValue();
}
}
}
// Replace all inputs with the extension operand.
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
// Constants may have users outside the cluster of to-be-promoted nodes,
// and so we need to replace those as we do the promotions.
if (isa<ConstantSDNode>(Inputs[i]))
continue;
else
DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
}
std::list<HandleSDNode> PromOpHandles;
for (auto &PromOp : PromOps)
PromOpHandles.emplace_back(PromOp);
// Replace all operations (these are all the same, but have a different
// (i1) return type). DAG.getNode will validate that the types of
// a binary operator match, so go through the list in reverse so that
// we've likely promoted both operands first. Any intermediate truncations or
// extensions disappear.
while (!PromOpHandles.empty()) {
SDValue PromOp = PromOpHandles.back().getValue();
PromOpHandles.pop_back();
if (PromOp.getOpcode() == ISD::TRUNCATE ||
PromOp.getOpcode() == ISD::SIGN_EXTEND ||
PromOp.getOpcode() == ISD::ZERO_EXTEND ||
PromOp.getOpcode() == ISD::ANY_EXTEND) {
if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
PromOp.getOperand(0).getValueType() != MVT::i1) {
// The operand is not yet ready (see comment below).
PromOpHandles.emplace_front(PromOp);
continue;
}
SDValue RepValue = PromOp.getOperand(0);
if (isa<ConstantSDNode>(RepValue))
RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);
DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
continue;
}
unsigned C;
switch (PromOp.getOpcode()) {
default: C = 0; break;
case ISD::SELECT: C = 1; break;
case ISD::SELECT_CC: C = 2; break;
}
if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
PromOp.getOperand(C).getValueType() != MVT::i1) ||
(!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
// The to-be-promoted operands of this node have not yet been
// promoted (this should be rare because we're going through the
// list backward, but if one of the operands has several users in
// this cluster of to-be-promoted nodes, it is possible).
PromOpHandles.emplace_front(PromOp);
continue;
}
SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
PromOp.getNode()->op_end());
// If there are any constant inputs, make sure they're replaced now.
for (unsigned i = 0; i < 2; ++i)
if (isa<ConstantSDNode>(Ops[C+i]))
Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);
DAG.ReplaceAllUsesOfValueWith(PromOp,
DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
}
// Now we're left with the initial truncation itself.
if (N->getOpcode() == ISD::TRUNCATE)
return N->getOperand(0);
// Otherwise, this is a comparison. The operands to be compared have just
// changed type (to i1), but everything else is the same.
return SDValue(N, 0);
}
SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
// If we're tracking CR bits, we need to be careful that we don't have:
// zext(binary-ops(trunc(x), trunc(y)))
// or
// zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
// such that we're unnecessarily moving things into CR bits that can more
// efficiently stay in GPRs. Note that if we're not certain that the high
// bits are set as required by the final extension, we still may need to do
// some masking to get the proper behavior.
// This same functionality is important on PPC64 when dealing with
// 32-to-64-bit extensions; these occur often when 32-bit values are used as
// the return values of functions. Because it is so similar, it is handled
// here as well.
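// For example (illustrative), zext i64 (or (trunc i32 %x), (trunc i32 %y)),
// where %x and %y are the original i64 sources of the truncations, can become
// (and (or %x, %y), 0xffffffff) computed at i64; the final mask is only
// emitted when the high bits are not already known to be zero (tracked by
// ReallyNeedsExt below).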
if (N->getValueType(0) != MVT::i32 &&
N->getValueType(0) != MVT::i64)
return SDValue();
if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
(N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
return SDValue();
if (N->getOperand(0).getOpcode() != ISD::AND &&
N->getOperand(0).getOpcode() != ISD::OR &&
N->getOperand(0).getOpcode() != ISD::XOR &&
N->getOperand(0).getOpcode() != ISD::SELECT &&
N->getOperand(0).getOpcode() != ISD::SELECT_CC)
return SDValue();
SmallVector<SDValue, 4> Inputs;
SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
SmallPtrSet<SDNode *, 16> Visited;
// Visit all inputs, collect all binary operations (and, or, xor and
// select) that are all fed by truncations.
while (!BinOps.empty()) {
SDValue BinOp = BinOps.back();
BinOps.pop_back();
if (!Visited.insert(BinOp.getNode()).second)
continue;
PromOps.push_back(BinOp);
for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
// The condition of the select is not promoted.
if (BinOp.getOpcode() == ISD::SELECT && i == 0)
continue;
if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
continue;
if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
isa<ConstantSDNode>(BinOp.getOperand(i))) {
Inputs.push_back(BinOp.getOperand(i));
} else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
BinOp.getOperand(i).getOpcode() == ISD::OR ||
BinOp.getOperand(i).getOpcode() == ISD::XOR ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
BinOps.push_back(BinOp.getOperand(i));
} else {
// We have an input that is not a truncation or another binary
// operation; we'll abort this transformation.
return SDValue();
}
}
}
// The operands of a select that must be truncated when the select is
// promoted because the operand is actually part of the to-be-promoted set.
DenseMap<SDNode *, EVT> SelectTruncOp[2];
// Make sure that this is a self-contained cluster of operations (which
// is not quite the same thing as saying that everything has only one
// use).
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
if (isa<ConstantSDNode>(Inputs[i]))
continue;
for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
UE = Inputs[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User != N && !Visited.count(User))
return SDValue();
// If we're going to promote the non-output-value operand(s) or SELECT or
// SELECT_CC, record them for truncation.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == Inputs[i])
SelectTruncOp[0].insert(std::make_pair(User,
User->getOperand(0).getValueType()));
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == Inputs[i])
SelectTruncOp[0].insert(std::make_pair(User,
User->getOperand(0).getValueType()));
if (User->getOperand(1) == Inputs[i])
SelectTruncOp[1].insert(std::make_pair(User,
User->getOperand(1).getValueType()));
}
}
}
for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
UE = PromOps[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User != N && !Visited.count(User))
return SDValue();
// If we're going to promote the non-output-value operand(s) or SELECT or
// SELECT_CC, record them for truncation.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == PromOps[i])
SelectTruncOp[0].insert(std::make_pair(User,
User->getOperand(0).getValueType()));
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == PromOps[i])
SelectTruncOp[0].insert(std::make_pair(User,
User->getOperand(0).getValueType()));
if (User->getOperand(1) == PromOps[i])
SelectTruncOp[1].insert(std::make_pair(User,
User->getOperand(1).getValueType()));
}
}
}
unsigned PromBits = N->getOperand(0).getValueSizeInBits();
bool ReallyNeedsExt = false;
if (N->getOpcode() != ISD::ANY_EXTEND) {
// If any of the inputs is not already appropriately sign/zero extended,
// then we'll still need to do that at the end.
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
if (isa<ConstantSDNode>(Inputs[i]))
continue;
unsigned OpBits =
Inputs[i].getOperand(0).getValueSizeInBits();
assert(PromBits < OpBits && "Truncation not to a smaller bit count?");
if ((N->getOpcode() == ISD::ZERO_EXTEND &&
!DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
APInt::getHighBitsSet(OpBits,
OpBits-PromBits))) ||
(N->getOpcode() == ISD::SIGN_EXTEND &&
DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
(OpBits-(PromBits-1)))) {
ReallyNeedsExt = true;
break;
}
}
}
// Replace all inputs, either with the truncation operand, or a
// truncation or extension to the final output type.
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
// Constant inputs need to be replaced with the to-be-promoted nodes that
// use them because they might have users outside of the cluster of
// promoted nodes.
if (isa<ConstantSDNode>(Inputs[i]))
continue;
SDValue InSrc = Inputs[i].getOperand(0);
if (Inputs[i].getValueType() == N->getValueType(0))
DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
else if (N->getOpcode() == ISD::SIGN_EXTEND)
DAG.ReplaceAllUsesOfValueWith(Inputs[i],
DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
else if (N->getOpcode() == ISD::ZERO_EXTEND)
DAG.ReplaceAllUsesOfValueWith(Inputs[i],
DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
else
DAG.ReplaceAllUsesOfValueWith(Inputs[i],
DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
}
std::list<HandleSDNode> PromOpHandles;
for (auto &PromOp : PromOps)
PromOpHandles.emplace_back(PromOp);
// Replace all operations (these are all the same, but have a different
// (promoted) return type). DAG.getNode will validate that the types of
// a binary operator match, so go through the list in reverse so that
// we've likely promoted both operands first.
while (!PromOpHandles.empty()) {
SDValue PromOp = PromOpHandles.back().getValue();
PromOpHandles.pop_back();
unsigned C;
switch (PromOp.getOpcode()) {
default: C = 0; break;
case ISD::SELECT: C = 1; break;
case ISD::SELECT_CC: C = 2; break;
}
if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
(!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
// The to-be-promoted operands of this node have not yet been
// promoted (this should be rare because we're going through the
// list backward, but if one of the operands has several users in
// this cluster of to-be-promoted nodes, it is possible).
PromOpHandles.emplace_front(PromOp);
continue;
}
// For SELECT and SELECT_CC nodes, we do a similar check for any
// to-be-promoted comparison inputs.
if (PromOp.getOpcode() == ISD::SELECT ||
PromOp.getOpcode() == ISD::SELECT_CC) {
if ((SelectTruncOp[0].count(PromOp.getNode()) &&
PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
(SelectTruncOp[1].count(PromOp.getNode()) &&
PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
PromOpHandles.emplace_front(PromOp);
continue;
}
}
SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
PromOp.getNode()->op_end());
// If this node has constant inputs, then they'll need to be promoted here.
for (unsigned i = 0; i < 2; ++i) {
if (!isa<ConstantSDNode>(Ops[C+i]))
continue;
if (Ops[C+i].getValueType() == N->getValueType(0))
continue;
if (N->getOpcode() == ISD::SIGN_EXTEND)
Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
else if (N->getOpcode() == ISD::ZERO_EXTEND)
Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
else
Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
}
// If we've promoted the comparison inputs of a SELECT or SELECT_CC,
// truncate them again to the original value type.
if (PromOp.getOpcode() == ISD::SELECT ||
PromOp.getOpcode() == ISD::SELECT_CC) {
auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
if (SI0 != SelectTruncOp[0].end())
Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
if (SI1 != SelectTruncOp[1].end())
Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
}
DAG.ReplaceAllUsesOfValueWith(PromOp,
DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
}
// Now we're left with the initial extension itself.
if (!ReallyNeedsExt)
return N->getOperand(0);
// To zero extend, just mask off everything except for the first bit (in the
// i1 case).
if (N->getOpcode() == ISD::ZERO_EXTEND)
return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
DAG.getConstant(APInt::getLowBitsSet(
N->getValueSizeInBits(0), PromBits),
dl, N->getValueType(0)));
assert(N->getOpcode() == ISD::SIGN_EXTEND &&
"Invalid extension type");
EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
SDValue ShiftCst =
DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
return DAG.getNode(
ISD::SRA, dl, N->getValueType(0),
DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
ShiftCst);
}
SDValue PPCTargetLowering::combineSetCC(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::SETCC &&
"Should be called with a SETCC node");
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
// If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
LHS.hasOneUse())
std::swap(LHS, RHS);
// x == 0-y --> x+y == 0
// x != 0-y --> x+y != 0
if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
RHS.hasOneUse()) {
SDLoc DL(N);
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
EVT OpVT = LHS.getValueType();
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
}
return DAGCombineTruncBoolExt(N, DCI);
}
// Is this an extending load from an f32 to an f64?
static bool isFPExtLoad(SDValue Op) {
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
return LD->getExtensionType() == ISD::EXTLOAD &&
Op.getValueType() == MVT::f64;
return false;
}
/// Reduces the number of fp-to-int conversion when building a vector.
///
/// If this vector is built out of floating to integer conversions,
/// transform it to a vector built out of floating point values followed by a
/// single floating to integer conversion of the vector.
/// Namely (build_vector (fptosi $A), (fptosi $B), ...)
/// becomes (fptosi (build_vector ($A, $B, ...)))
SDValue PPCTargetLowering::
combineElementTruncationToVectorTruncation(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
"Should be called with a BUILD_VECTOR node");
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue FirstInput = N->getOperand(0);
assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
"The input operand must be an fp-to-int conversion.");
// This combine happens after legalization, so the fp_to_[su]i nodes are
// already converted to PPCISD nodes.
unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
if (FirstConversion == PPCISD::FCTIDZ ||
FirstConversion == PPCISD::FCTIDUZ ||
FirstConversion == PPCISD::FCTIWZ ||
FirstConversion == PPCISD::FCTIWUZ) {
bool IsSplat = true;
bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
FirstConversion == PPCISD::FCTIWUZ;
EVT SrcVT = FirstInput.getOperand(0).getValueType();
SmallVector<SDValue, 4> Ops;
EVT TargetVT = N->getValueType(0);
for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
SDValue NextOp = N->getOperand(i);
if (NextOp.getOpcode() != PPCISD::MFVSR)
return SDValue();
unsigned NextConversion = NextOp.getOperand(0).getOpcode();
if (NextConversion != FirstConversion)
return SDValue();
// If we are converting to 32-bit integers, we need to add an FP_ROUND.
// This is not valid if the input was originally double precision. It is
// also not profitable to do unless this is an extending load in which
// case doing this combine will allow us to combine consecutive loads.
if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))
return SDValue();
if (N->getOperand(i) != FirstInput)
IsSplat = false;
}
// If this is a splat, we leave it as-is since there will be only a single
// fp-to-int conversion followed by a splat of the integer. This is better
// for 32-bit and smaller ints and neutral for 64-bit ints.
if (IsSplat)
return SDValue();
// Now that we know we have the right type of node, get its operands
for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
SDValue In = N->getOperand(i).getOperand(0);
if (Is32Bit) {
// For 32-bit values, we need to add an FP_ROUND node (if we made it
// here, we know that all inputs are extending loads so this is safe).
if (In.isUndef())
Ops.push_back(DAG.getUNDEF(SrcVT));
else {
SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl,
MVT::f32, In.getOperand(0),
DAG.getIntPtrConstant(1, dl));
Ops.push_back(Trunc);
}
} else
Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));
}
unsigned Opcode;
if (FirstConversion == PPCISD::FCTIDZ ||
FirstConversion == PPCISD::FCTIWZ)
Opcode = ISD::FP_TO_SINT;
else
Opcode = ISD::FP_TO_UINT;
EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);
return DAG.getNode(Opcode, dl, TargetVT, BV);
}
return SDValue();
}
/// Reduce the number of loads when building a vector.
///
/// Building a vector out of multiple loads can be converted to a load
/// of the vector type if the loads are consecutive. If the loads are
/// consecutive but in descending order, a shuffle is added at the end
/// to reorder the vector.
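/// For example (illustrative), (build_vector (load a), (load a+4),
/// (load a+8), (load a+12)) can become a single vector load from a, while
/// the descending-order variant becomes a vector load from the lowest
/// address followed by a reversing vector_shuffle.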
static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
"Should be called with a BUILD_VECTOR node");
SDLoc dl(N);
// Return early for non-byte-sized types, as they can't be consecutive.
if (!N->getValueType(0).getVectorElementType().isByteSized())
return SDValue();
bool InputsAreConsecutiveLoads = true;
bool InputsAreReverseConsecutive = true;
unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize();
SDValue FirstInput = N->getOperand(0);
bool IsRoundOfExtLoad = false;
if (FirstInput.getOpcode() == ISD::FP_ROUND &&
FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {
LoadSDNode *LD = dyn_cast<LoadSDNode>(FirstInput.getOperand(0));
IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD;
}
// Not a build vector of (possibly fp_rounded) loads.
if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||
N->getNumOperands() == 1)
return SDValue();
for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
// If any inputs are fp_round(extload), they all must be.
if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)
return SDValue();
SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) :
N->getOperand(i);
if (NextInput.getOpcode() != ISD::LOAD)
return SDValue();
SDValue PreviousInput =
IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(PreviousInput);
LoadSDNode *LD2 = dyn_cast<LoadSDNode>(NextInput);
// If any inputs are fp_round(extload), they all must be.
if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
return SDValue();
if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG))
InputsAreConsecutiveLoads = false;
if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG))
InputsAreReverseConsecutive = false;
// Exit early if the loads are neither consecutive nor reverse consecutive.
if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
return SDValue();
}
assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
"The loads cannot be both consecutive and reverse consecutive.");
SDValue FirstLoadOp =
IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput;
SDValue LastLoadOp =
IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) :
N->getOperand(N->getNumOperands()-1);
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(FirstLoadOp);
LoadSDNode *LDL = dyn_cast<LoadSDNode>(LastLoadOp);
if (InputsAreConsecutiveLoads) {
assert(LD1 && "Input needs to be a LoadSDNode.");
return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(),
LD1->getBasePtr(), LD1->getPointerInfo(),
LD1->getAlignment());
}
if (InputsAreReverseConsecutive) {
assert(LDL && "Input needs to be a LoadSDNode.");
SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(),
LDL->getBasePtr(), LDL->getPointerInfo(),
LDL->getAlignment());
SmallVector<int, 16> Ops;
for (int i = N->getNumOperands() - 1; i >= 0; i--)
Ops.push_back(i);
return DAG.getVectorShuffle(N->getValueType(0), dl, Load,
DAG.getUNDEF(N->getValueType(0)), Ops);
}
return SDValue();
}
// This function adds the vector_shuffle needed to get the elements of the
// vector extract into the correct positions, as specified by the
// CorrectElems encoding.
static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
SDValue Input, uint64_t Elems,
uint64_t CorrectElems) {
SDLoc dl(N);
unsigned NumElems = Input.getValueType().getVectorNumElements();
SmallVector<int, 16> ShuffleMask(NumElems, -1);
// Knowing the element indices being extracted from the original vector and
// the order in which they're being inserted, just place them at the element
// indices required by the instruction.
for (unsigned i = 0; i < N->getNumOperands(); i++) {
if (DAG.getDataLayout().isLittleEndian())
ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
else
ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
CorrectElems = CorrectElems >> 8;
Elems = Elems >> 8;
}
SDValue Shuffle =
DAG.getVectorShuffle(Input.getValueType(), dl, Input,
DAG.getUNDEF(Input.getValueType()), ShuffleMask);
EVT Ty = N->getValueType(0);
SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle);
return BV;
}
// Look for build vector patterns where the input operands come from
// sign-extended vector_extract elements with specific indices. If the correct
// indices aren't used, add a vector shuffle to fix up the indices and create
// a new PPCISD::SExtVElems node, which selects the vector sign extend
// instructions during instruction selection.
static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
// This array encodes the indices that the vector sign extend instructions
// extract from when extending from one type to another for both BE and LE.
// The right nibble of each byte corresponds to the LE indices,
// and the left nibble of each byte corresponds to the BE indices.
// For example: 0x3074B8FC byte->word
// For LE: the allowed indices are: 0x0,0x4,0x8,0xC
// For BE: the allowed indices are: 0x3,0x7,0xB,0xF
// For example: 0x000070F8 byte->double word
// For LE: the allowed indices are: 0x0,0x8
// For BE: the allowed indices are: 0x7,0xF
uint64_t TargetElems[] = {
0x3074B8FC, // b->w
0x000070F8, // b->d
0x10325476, // h->w
0x00003074, // h->d
0x00001032, // w->d
};
uint64_t Elems = 0;
int Index;
SDValue Input;
auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
if (!Op)
return false;
if (Op.getOpcode() != ISD::SIGN_EXTEND &&
Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
return false;
// A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
// of the right width.
SDValue Extract = Op.getOperand(0);
if (Extract.getOpcode() == ISD::ANY_EXTEND)
Extract = Extract.getOperand(0);
if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return false;
ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
if (!ExtOp)
return false;
Index = ExtOp->getZExtValue();
if (Input && Input != Extract.getOperand(0))
return false;
if (!Input)
Input = Extract.getOperand(0);
Elems = Elems << 8;
Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
Elems |= Index;
return true;
};
// If the build vector operands aren't sign-extended vector extracts
// of the same input vector, then return.
for (unsigned i = 0; i < N->getNumOperands(); i++) {
if (!isSExtOfVecExtract(N->getOperand(i))) {
return SDValue();
}
}
// If the vector extract indices are not correct, add the appropriate
// vector_shuffle.
int TgtElemArrayIdx;
int InputSize = Input.getValueType().getScalarSizeInBits();
int OutputSize = N->getValueType(0).getScalarSizeInBits();
if (InputSize + OutputSize == 40)
TgtElemArrayIdx = 0;
else if (InputSize + OutputSize == 72)
TgtElemArrayIdx = 1;
else if (InputSize + OutputSize == 48)
TgtElemArrayIdx = 2;
else if (InputSize + OutputSize == 80)
TgtElemArrayIdx = 3;
else if (InputSize + OutputSize == 96)
TgtElemArrayIdx = 4;
else
return SDValue();
uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
CorrectElems = DAG.getDataLayout().isLittleEndian()
? CorrectElems & 0x0F0F0F0F0F0F0F0F
: CorrectElems & 0xF0F0F0F0F0F0F0F0;
if (Elems != CorrectElems) {
return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
}
// Regular lowering will catch cases where a shuffle is not needed.
return SDValue();
}
SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
"Should be called with a BUILD_VECTOR node");
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
if (!Subtarget.hasVSX())
return SDValue();
// The target independent DAG combiner will leave a build_vector of
// float-to-int conversions intact. We can generate MUCH better code for
// a float-to-int conversion of a vector of floats.
SDValue FirstInput = N->getOperand(0);
if (FirstInput.getOpcode() == PPCISD::MFVSR) {
SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
if (Reduced)
return Reduced;
}
// If we're building a vector out of consecutive loads, just load that
// vector type.
SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
if (Reduced)
return Reduced;
// If we're building a vector out of extended elements from another vector
// we have P9 vector integer extend instructions. The code assumes legal
// input types (i.e. it can't handle things like v4i16) so do not run before
// legalization.
if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {
Reduced = combineBVOfVecSExt(N, DAG);
if (Reduced)
return Reduced;
}
if (N->getValueType(0) != MVT::v2f64)
return SDValue();
// Looking for:
// (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
FirstInput.getOpcode() != ISD::UINT_TO_FP)
return SDValue();
if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
return SDValue();
if (FirstInput.getOpcode() != N->getOperand(1).getOpcode())
return SDValue();
SDValue Ext1 = FirstInput.getOperand(0);
SDValue Ext2 = N->getOperand(1).getOperand(0);
if (Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
if (!Ext1Op || !Ext2Op)
return SDValue();
if (Ext1.getOperand(0).getValueType() != MVT::v4i32 ||
Ext1.getOperand(0) != Ext2.getOperand(0))
return SDValue();
int FirstElem = Ext1Op->getZExtValue();
int SecondElem = Ext2Op->getZExtValue();
int SubvecIdx;
if (FirstElem == 0 && SecondElem == 1)
SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
else if (FirstElem == 2 && SecondElem == 3)
SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
else
return SDValue();
SDValue SrcVec = Ext1.getOperand(0);
auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
return DAG.getNode(NodeType, dl, MVT::v2f64,
SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
}
SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
DAGCombinerInfo &DCI) const {
assert((N->getOpcode() == ISD::SINT_TO_FP ||
N->getOpcode() == ISD::UINT_TO_FP) &&
"Need an int -> FP conversion node here");
if (useSoftFloat() || !Subtarget.has64BitSupport())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Op(N, 0);
// Don't handle ppc_fp128 here, or conversions from source types that the
// hardware cannot convert without going out of range.
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||
Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64))
return SDValue();
SDValue FirstOperand(Op.getOperand(0));
bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
(FirstOperand.getValueType() == MVT::i8 ||
FirstOperand.getValueType() == MVT::i16);
if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
bool DstDouble = Op.getValueType() == MVT::f64;
unsigned ConvOp = Signed ?
(DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) :
(DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
SDValue WidthConst =
DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
dl, false);
LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode());
SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
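// LXSIZX zero-extends the sub-word value into a 64-bit VSR element; the
// width constant selects a byte or halfword load (lxsibzx/lxsihzx) during
// instruction selection.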
SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i8, LDN->getMemOperand());
// For signed conversion, we need to sign-extend the value in the VSR
if (Signed) {
SDValue ExtOps[] = { Ld, WidthConst };
SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext);
} else
return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
}
// For i32 intermediate values, unfortunately, the conversion functions
// leave the upper 32 bits of the value undefined. Within the set of
// scalar instructions, we have no method for zero- or sign-extending the
// value. Thus, we cannot handle i32 intermediate values here.
if (Op.getOperand(0).getValueType() == MVT::i32)
return SDValue();
assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
"UINT_TO_FP is supported only with FPCVT");
// If we have FCFIDS, then use it when converting to single-precision.
// Otherwise, convert to double-precision and then round.
unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
: PPCISD::FCFIDS)
: (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
: PPCISD::FCFID);
MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? MVT::f32
: MVT::f64;
// If we're converting from a float to an int and back to a float again,
// then we don't need the store/load pair at all.
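// For example, (f64 (sint_to_fp (fp_to_sint f64:X))) can stay in FP
// registers as an fctidz followed by an fcfid.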
if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
Subtarget.hasFPCVT()) ||
(Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
SDValue Src = Op.getOperand(0).getOperand(0);
if (Src.getValueType() == MVT::f32) {
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
DCI.AddToWorklist(Src.getNode());
} else if (Src.getValueType() != MVT::f64) {
// Make sure that we don't pick up a ppc_fp128 source value.
return SDValue();
}
unsigned FCTOp =
Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
PPCISD::FCTIDUZ;
SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
FP = DAG.getNode(ISD::FP_ROUND, dl,
MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
DCI.AddToWorklist(FP.getNode());
}
return FP;
}
return SDValue();
}
// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
// builtins) into loads with swaps.
SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Chain;
SDValue Base;
MachineMemOperand *MMO;
switch (N->getOpcode()) {
default:
llvm_unreachable("Unexpected opcode for little endian VSX load");
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(N);
Chain = LD->getChain();
Base = LD->getBasePtr();
MMO = LD->getMemOperand();
// If the MMO suggests this isn't a load of a full vector, leave
// things alone. For a built-in, we have to make the change for
// correctness, so if there is a size problem, that is a bug.
if (MMO->getSize() < 16)
return SDValue();
break;
}
case ISD::INTRINSIC_W_CHAIN: {
MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
Chain = Intrin->getChain();
// Similarly to the store case below, Intrin->getBasePtr() doesn't get
// us what we want. Get operand 2 instead.
Base = Intrin->getOperand(2);
MMO = Intrin->getMemOperand();
break;
}
}
MVT VecTy = N->getValueType(0).getSimpleVT();
// Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
// aligned and the type is a vector with elements up to 4 bytes.
if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment() % 16) &&
VecTy.getScalarSizeInBits() <= 32) {
return SDValue();
}
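// lxvd2x loads the two doublewords with element 0 first regardless of the
// target's endianness; the xxswapd that follows puts them into the order
// little-endian code expects.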
SDValue LoadOps[] = { Chain, Base };
SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
DAG.getVTList(MVT::v2f64, MVT::Other),
LoadOps, MVT::v2f64, MMO);
DCI.AddToWorklist(Load.getNode());
Chain = Load.getValue(1);
SDValue Swap = DAG.getNode(
PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load);
DCI.AddToWorklist(Swap.getNode());
// Add a bitcast if the resulting load type doesn't match v2f64.
if (VecTy != MVT::v2f64) {
SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap);
DCI.AddToWorklist(N.getNode());
// Package {bitcast value, swap's chain} to match Load's shape.
return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other),
N, Swap.getValue(1));
}
return Swap;
}
// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
// builtins) into stores with swaps.
SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Chain;
SDValue Base;
unsigned SrcOpnd;
MachineMemOperand *MMO;
switch (N->getOpcode()) {
default:
llvm_unreachable("Unexpected opcode for little endian VSX store");
case ISD::STORE: {
StoreSDNode *ST = cast<StoreSDNode>(N);
Chain = ST->getChain();
Base = ST->getBasePtr();
MMO = ST->getMemOperand();
SrcOpnd = 1;
// If the MMO suggests this isn't a store of a full vector, leave
// things alone. For a built-in, we have to make the change for
// correctness, so if there is a size problem, that is a bug.
if (MMO->getSize() < 16)
return SDValue();
break;
}
case ISD::INTRINSIC_VOID: {
MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
Chain = Intrin->getChain();
// Intrin->getBasePtr() oddly does not get what we want.
Base = Intrin->getOperand(3);
MMO = Intrin->getMemOperand();
SrcOpnd = 2;
break;
}
}
SDValue Src = N->getOperand(SrcOpnd);
MVT VecTy = Src.getValueType().getSimpleVT();
// Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the store is
// aligned and the type is a vector with elements up to 4 bytes.
if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment() % 16) &&
VecTy.getScalarSizeInBits() <= 32) {
return SDValue();
}
// All stores are done as v2f64, with a bitcast beforehand if needed.
if (VecTy != MVT::v2f64) {
Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
DCI.AddToWorklist(Src.getNode());
}
SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);
DCI.AddToWorklist(Swap.getNode());
Chain = Swap.getValue(1);
SDValue StoreOps[] = { Chain, Swap, Base };
SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
DAG.getVTList(MVT::Other),
StoreOps, VecTy, MMO);
DCI.AddToWorklist(Store.getNode());
return Store;
}
// Handle DAG combine for STORE (FP_TO_INT F).
SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
unsigned Opcode = N->getOperand(1).getOpcode();
assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT)
&& "Not a FP_TO_INT Instruction!");
SDValue Val = N->getOperand(1).getOperand(0);
EVT Op1VT = N->getOperand(1).getValueType();
EVT ResVT = Val.getValueType();
// Floating point types smaller than 32 bits are not legal on Power.
if (ResVT.getScalarSizeInBits() < 32)
return SDValue();
// Only perform combine for conversion to i64/i32 or power9 i16/i8.
bool ValidTypeForStoreFltAsInt =
(Op1VT == MVT::i32 || Op1VT == MVT::i64 ||
(Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Vector() ||
cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
return SDValue();
// Extend f32 values to f64
if (ResVT.getScalarSizeInBits() == 32) {
Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
DCI.AddToWorklist(Val.getNode());
}
// Set signed or unsigned conversion opcode.
unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ?
PPCISD::FP_TO_SINT_IN_VSR :
PPCISD::FP_TO_UINT_IN_VSR;
Val = DAG.getNode(ConvOpcode,
dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val);
DCI.AddToWorklist(Val.getNode());
// Set number of bytes being converted.
unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2),
DAG.getIntPtrConstant(ByteSize, dl, false),
DAG.getValueType(Op1VT) };
Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl,
DAG.getVTList(MVT::Other), Ops,
cast<StoreSDNode>(N)->getMemoryVT(),
cast<StoreSDNode>(N)->getMemOperand());
DCI.AddToWorklist(Val.getNode());
return Val;
}
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
LSBaseSDNode *LSBase,
DAGCombinerInfo &DCI) const {
assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
"Not a reverse memop pattern!");
auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
auto Mask = SVN->getMask();
int i = 0;
auto I = Mask.rbegin();
auto E = Mask.rend();
for (; I != E; ++I) {
if (*I != i)
return false;
i++;
}
return true;
};
SelectionDAG &DAG = DCI.DAG;
EVT VT = SVN->getValueType(0);
if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
return SDValue();
// Before P9, the PPCVSXSwapRemoval pass adjusts the element order
// (see the comment in PPCVSXSwapRemoval.cpp). This combine conflicts
// with that optimization, so we don't do it on pre-P9 subtargets.
if (!Subtarget.hasP9Vector())
return SDValue();
if (!IsElementReverse(SVN))
return SDValue();
if (LSBase->getOpcode() == ISD::LOAD) {
SDLoc dl(SVN);
SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
return DAG.getMemIntrinsicNode(
PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,
LSBase->getMemoryVT(), LSBase->getMemOperand());
}
if (LSBase->getOpcode() == ISD::STORE) {
SDLoc dl(LSBase);
SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
LSBase->getBasePtr()};
return DAG.getMemIntrinsicNode(
PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps,
LSBase->getMemoryVT(), LSBase->getMemOperand());
}
llvm_unreachable("Expected a load or store node here");
}
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
case ISD::ADD:
return combineADD(N, DCI);
case ISD::SHL:
return combineSHL(N, DCI);
case ISD::SRA:
return combineSRA(N, DCI);
case ISD::SRL:
return combineSRL(N, DCI);
case ISD::MUL:
return combineMUL(N, DCI);
case PPCISD::SHL:
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
break;
case PPCISD::SRL:
if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
return N->getOperand(0);
break;
case PPCISD::SRA:
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
if (C->isNullValue() || // 0 >>s V -> 0.
C->isAllOnesValue()) // -1 >>s V -> -1.
return N->getOperand(0);
}
break;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
return DAGCombineExtBoolTrunc(N, DCI);
case ISD::TRUNCATE:
return combineTRUNCATE(N, DCI);
case ISD::SETCC:
if (SDValue CSCC = combineSetCC(N, DCI))
return CSCC;
LLVM_FALLTHROUGH;
case ISD::SELECT_CC:
return DAGCombineTruncBoolExt(N, DCI);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return combineFPToIntToFP(N, DCI);
case ISD::VECTOR_SHUFFLE:
if (ISD::isNormalLoad(N->getOperand(0).getNode())) {
LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0));
return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
}
break;
case ISD::STORE: {
EVT Op1VT = N->getOperand(1).getValueType();
unsigned Opcode = N->getOperand(1).getOpcode();
if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) {
SDValue Val = combineStoreFPToInt(N, DCI);
if (Val)
return Val;
}
if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));
SDValue Val = combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);
if (Val)
return Val;
}
// Turn STORE (BSWAP) -> sthbrx/stwbrx.
if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
N->getOperand(1).getNode()->hasOneUse() &&
(Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
(Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
// STBRX can only handle simple types and it makes no sense to store fewer
// than two bytes in byte-reversed order.
EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
if (mVT.isExtended() || mVT.getSizeInBits() < 16)
break;
SDValue BSwapOp = N->getOperand(1).getOperand(0);
// Do an any-extend to 32-bits if this is a half-word input.
if (BSwapOp.getValueType() == MVT::i16)
BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
// If the type of the BSWAP operand is wider than the stored memory width,
// it needs to be shifted right before the STBRX.
if (Op1VT.bitsGT(mVT)) {
int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
DAG.getConstant(Shift, dl, MVT::i32));
// Need to truncate if this is a bswap of i64 stored as i32/i16.
if (Op1VT == MVT::i64)
BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
}
SDValue Ops[] = {
N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
};
return
DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
Ops, cast<StoreSDNode>(N)->getMemoryVT(),
cast<StoreSDNode>(N)->getMemOperand());
}
// STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
// So it can increase the chance of CSE constant construction.
if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) {
// Need to sign-extend to 64 bits to handle negative values.
EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
MemVT.getSizeInBits());
SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);
// DAG.getTruncStore() can't be used here because it doesn't accept
// the general (base + offset) addressing mode.
// So we use UpdateNodeOperands and setTruncatingStore instead.
DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
N->getOperand(3));
cast<StoreSDNode>(N)->setTruncatingStore(true);
return SDValue(N, 0);
}
// For little endian, VSX stores require generating xxswapd/lxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
if (Op1VT.isSimple()) {
MVT StoreVT = Op1VT.getSimpleVT();
if (Subtarget.needsSwapsForVSXMemOps() &&
(StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
return expandVSXStoreForLE(N, DCI);
}
break;
}
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT VT = LD->getValueType(0);
// For little endian, VSX loads require generating lxvd2x/xxswapd.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
if (VT.isSimple()) {
MVT LoadVT = VT.getSimpleVT();
if (Subtarget.needsSwapsForVSXMemOps() &&
(LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
return expandVSXLoadForLE(N, DCI);
}
// We sometimes end up with a 64-bit integer load, from which we extract
// two single-precision floating-point numbers. This happens with
// std::complex<float>, and other similar structures, because of the way we
// canonicalize structure copies. However, if we lack direct moves,
// then the final bitcasts from the extracted integer values to the
// floating-point numbers turn into store/load pairs. Even with direct moves,
// just loading the two floating-point numbers is likely better.
auto ReplaceTwoFloatLoad = [&]() {
if (VT != MVT::i64)
return false;
if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
LD->isVolatile())
return false;
// We're looking for a sequence like this:
// t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
// t16: i64 = srl t13, Constant:i32<32>
// t17: i32 = truncate t16
// t18: f32 = bitcast t17
// t19: i32 = truncate t13
// t20: f32 = bitcast t19
if (!LD->hasNUsesOfValue(2, 0))
return false;
auto UI = LD->use_begin();
while (UI.getUse().getResNo() != 0) ++UI;
SDNode *Trunc = *UI++;
while (UI.getUse().getResNo() != 0) ++UI;
SDNode *RightShift = *UI;
if (Trunc->getOpcode() != ISD::TRUNCATE)
std::swap(Trunc, RightShift);
if (Trunc->getOpcode() != ISD::TRUNCATE ||
Trunc->getValueType(0) != MVT::i32 ||
!Trunc->hasOneUse())
return false;
if (RightShift->getOpcode() != ISD::SRL ||
!isa<ConstantSDNode>(RightShift->getOperand(1)) ||
RightShift->getConstantOperandVal(1) != 32 ||
!RightShift->hasOneUse())
return false;
SDNode *Trunc2 = *RightShift->use_begin();
if (Trunc2->getOpcode() != ISD::TRUNCATE ||
Trunc2->getValueType(0) != MVT::i32 ||
!Trunc2->hasOneUse())
return false;
SDNode *Bitcast = *Trunc->use_begin();
SDNode *Bitcast2 = *Trunc2->use_begin();
if (Bitcast->getOpcode() != ISD::BITCAST ||
Bitcast->getValueType(0) != MVT::f32)
return false;
if (Bitcast2->getOpcode() != ISD::BITCAST ||
Bitcast2->getValueType(0) != MVT::f32)
return false;
if (Subtarget.isLittleEndian())
std::swap(Bitcast, Bitcast2);
// Bitcast has the second float (in memory-layout order) and Bitcast2
// has the first one.
SDValue BasePtr = LD->getBasePtr();
if (LD->isIndexed()) {
assert(LD->getAddressingMode() == ISD::PRE_INC &&
"Non-pre-inc AM on PPC?");
BasePtr =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
LD->getOffset());
}
auto MMOFlags =
LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
LD->getPointerInfo(), LD->getAlignment(),
MMOFlags, LD->getAAInfo());
SDValue AddPtr =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
BasePtr, DAG.getIntPtrConstant(4, dl));
SDValue FloatLoad2 = DAG.getLoad(
MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
LD->getPointerInfo().getWithOffset(4),
MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo());
if (LD->isIndexed()) {
// Note that DAGCombine should re-form any pre-increment load(s) from
// what is produced here if that makes sense.
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
}
DCI.CombineTo(Bitcast2, FloatLoad);
DCI.CombineTo(Bitcast, FloatLoad2);
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
SDValue(FloatLoad2.getNode(), 1));
return true;
};
if (ReplaceTwoFloatLoad())
return SDValue(N, 0);
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy);
if (LD->isUnindexed() && VT.isVector() &&
((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
// P8 and later hardware should just use LOAD.
!Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 ||
VT == MVT::v4i32 || VT == MVT::v4f32)) ||
(Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
LD->getAlignment() >= ScalarABIAlignment)) &&
LD->getAlignment() < ABIAlignment) {
// This is a type-legal unaligned Altivec or QPX load.
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
bool isLittleEndian = Subtarget.isLittleEndian();
// This implements the loading of unaligned vectors as described in
// the venerable Apple Velocity Engine overview. Specifically:
// https://developer.apple.com/hardwaredrivers/ve/alignment.html
// https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
//
// The general idea is to expand a sequence of one or more unaligned
// loads into an alignment-based permutation-control instruction (lvsl
// or lvsr), a series of regular vector loads (which always truncate
// their input address to an aligned address), and a series of
// permutations. The results of these permutations are the requested
// loaded values. The trick is that the last "extra" load is not taken
// from the address you might suspect (sizeof(vector) bytes after the
// last requested load), but rather sizeof(vector) - 1 bytes after the
// last requested vector. The point of this is to avoid a page fault if
// the base address happened to be aligned. This works because if the
// base address is aligned, then adding less than a full vector length
// will cause the last vector in the sequence to be (re)loaded.
// Otherwise, the next vector will be fetched as you might suspect was
// necessary.
// We might be able to reuse the permutation generation from
// a different base address offset from this one by an aligned amount.
// The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
// optimization later.
Intrinsic::ID Intr, IntrLD, IntrPerm;
MVT PermCntlTy, PermTy, LDTy;
if (Subtarget.hasAltivec()) {
Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
Intrinsic::ppc_altivec_lvsl;
IntrLD = Intrinsic::ppc_altivec_lvx;
IntrPerm = Intrinsic::ppc_altivec_vperm;
PermCntlTy = MVT::v16i8;
PermTy = MVT::v4i32;
LDTy = MVT::v4i32;
} else {
Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
Intrinsic::ppc_qpx_qvlpcls;
IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
Intrinsic::ppc_qpx_qvlfs;
IntrPerm = Intrinsic::ppc_qpx_qvfperm;
PermCntlTy = MVT::v4f64;
PermTy = MVT::v4f64;
LDTy = MemVT.getSimpleVT();
}
SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);
// Create the new MMO for the new base load. It is like the original MMO,
// but represents an area in memory almost twice the vector size centered
// on the original address. If the address is unaligned, we might start
// reading up to (sizeof(vector)-1) bytes below the address of the
// original unaligned load.
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *BaseMMO =
MF.getMachineMemOperand(LD->getMemOperand(),
-(long)MemVT.getStoreSize()+1,
2*MemVT.getStoreSize()-1);
// Create the new base load.
SDValue LDXIntID =
DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
SDValue BaseLoad =
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
DAG.getVTList(PermTy, MVT::Other),
BaseLoadOps, LDTy, BaseMMO);
// Note that the value of IncOffset (which is provided to the next
// load's pointer info offset value, and thus used to calculate the
// alignment), and the value of IncValue (which is actually used to
// increment the pointer value) are different! This is because we
// require the next load to appear to be aligned, even though it
// is actually offset from the base pointer by a lesser amount.
int IncOffset = VT.getSizeInBits() / 8;
int IncValue = IncOffset;
// Walk (both up and down) the chain looking for another load at the real
// (aligned) offset (the alignment of the other load does not matter in
// this case). If found, then do not use the offset reduction trick, as
// that will prevent the loads from being later combined (as they would
// otherwise be duplicates).
if (!findConsecutiveLoad(LD, DAG))
--IncValue;
SDValue Increment =
DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
MachineMemOperand *ExtraMMO =
MF.getMachineMemOperand(LD->getMemOperand(),
1, 2*MemVT.getStoreSize()-1);
SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
SDValue ExtraLoad =
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
DAG.getVTList(PermTy, MVT::Other),
ExtraLoadOps, LDTy, ExtraMMO);
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
BaseLoad.getValue(1), ExtraLoad.getValue(1));
// Because vperm has a big-endian bias, we must reverse the order
// of the input vectors and complement the permute control vector
// when generating little endian code. We have already handled the
// latter by using lvsr instead of lvsl, so just reverse BaseLoad
// and ExtraLoad here.
SDValue Perm;
if (isLittleEndian)
Perm = BuildIntrinsicOp(IntrPerm,
ExtraLoad, BaseLoad, PermCntl, DAG, dl);
else
Perm = BuildIntrinsicOp(IntrPerm,
BaseLoad, ExtraLoad, PermCntl, DAG, dl);
if (VT != PermTy)
Perm = Subtarget.hasAltivec() ?
DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
DAG.getTargetConstant(1, dl, MVT::i64));
// second argument is 1 because this rounding
// is always exact.
// The output of the permutation is our loaded result, the TokenFactor is
// our new chain.
DCI.CombineTo(N, Perm, TF);
return SDValue(N, 0);
}
}
break;
case ISD::INTRINSIC_WO_CHAIN: {
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
: Intrinsic::ppc_altivec_lvsl);
if ((IID == Intr ||
IID == Intrinsic::ppc_qpx_qvlpcld ||
IID == Intrinsic::ppc_qpx_qvlpcls) &&
N->getOperand(1)->getOpcode() == ISD::ADD) {
SDValue Add = N->getOperand(1);
int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;
if (DAG.MaskedValueIsZero(Add->getOperand(1),
APInt::getAllOnesValue(Bits /* alignment */)
.zext(Add.getScalarValueSizeInBits()))) {
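// The addend is a multiple of the alignment, so (base + addend) has the
// same alignment residue as base and produces the same permute control.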
SDNode *BasePtr = Add->getOperand(0).getNode();
for (SDNode::use_iterator UI = BasePtr->use_begin(),
UE = BasePtr->use_end();
UI != UE; ++UI) {
if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
// We've found another LVSL/LVSR, and this address is an aligned
// multiple of that one. The results will be the same, so use the
// one we've just found instead.
return SDValue(*UI, 0);
}
}
}
if (isa<ConstantSDNode>(Add->getOperand(1))) {
SDNode *BasePtr = Add->getOperand(0).getNode();
for (SDNode::use_iterator UI = BasePtr->use_begin(),
UE = BasePtr->use_end(); UI != UE; ++UI) {
if (UI->getOpcode() == ISD::ADD &&
isa<ConstantSDNode>(UI->getOperand(1)) &&
(cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) %
(1ULL << Bits) == 0) {
SDNode *OtherAdd = *UI;
for (SDNode::use_iterator VI = OtherAdd->use_begin(),
VE = OtherAdd->use_end(); VI != VE; ++VI) {
if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) {
return SDValue(*VI, 0);
}
}
}
}
}
}
// Combine vmaxsw/h/b(a, negation of a) to abs(a), which exposes the
// vabsduw/h/b opportunity for downstream combines.
if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
(IID == Intrinsic::ppc_altivec_vmaxsw ||
IID == Intrinsic::ppc_altivec_vmaxsh ||
IID == Intrinsic::ppc_altivec_vmaxsb)) {
SDValue V1 = N->getOperand(1);
SDValue V2 = N->getOperand(2);
if ((V1.getSimpleValueType() == MVT::v4i32 ||
V1.getSimpleValueType() == MVT::v8i16 ||
V1.getSimpleValueType() == MVT::v16i8) &&
V1.getSimpleValueType() == V2.getSimpleValueType()) {
// (0-a, a)
if (V1.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
V1.getOperand(1) == V2) {
return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
}
// (a, 0-a)
if (V2.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
V2.getOperand(1) == V1) {
return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
}
// (x-y, y-x)
if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
V1.getOperand(0) == V2.getOperand(1) &&
V1.getOperand(1) == V2.getOperand(0)) {
return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
}
}
}
}
break;
case ISD::INTRINSIC_W_CHAIN:
// For little endian, VSX loads require generating lxvd2x/xxswapd.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
if (Subtarget.needsSwapsForVSXMemOps()) {
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default:
break;
case Intrinsic::ppc_vsx_lxvw4x:
case Intrinsic::ppc_vsx_lxvd2x:
return expandVSXLoadForLE(N, DCI);
}
}
break;
case ISD::INTRINSIC_VOID:
// For little endian, VSX stores require generating xxswapd/stxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
if (Subtarget.needsSwapsForVSXMemOps()) {
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default:
break;
case Intrinsic::ppc_vsx_stxvw4x:
case Intrinsic::ppc_vsx_stxvd2x:
return expandVSXStoreForLE(N, DCI);
}
}
break;
case ISD::BSWAP:
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
N->getOperand(0).hasOneUse() &&
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
(Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
N->getValueType(0) == MVT::i64))) {
SDValue Load = N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(Load);
// Create the byte-swapping load.
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr(), // Ptr
DAG.getValueType(N->getValueType(0)) // VT
};
SDValue BSLoad =
DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
DAG.getVTList(N->getValueType(0) == MVT::i64 ?
MVT::i64 : MVT::i32, MVT::Other),
Ops, LD->getMemoryVT(), LD->getMemOperand());
// If this is an i16 load, insert the truncate.
SDValue ResVal = BSLoad;
if (N->getValueType(0) == MVT::i16)
ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);
// First, combine the bswap away. This makes the value produced by the
// load dead.
DCI.CombineTo(N, ResVal);
// Next, combine the load away; we give it a bogus result value but a real
// chain result. The result value is dead because the bswap is dead.
DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
}
break;
case PPCISD::VCMP:
// If a VCMPo node already exists with exactly the same operands as this
// node, use its result instead of this node (VCMPo computes both a CR6 and
// a normal output).
//
if (!N->getOperand(0).hasOneUse() &&
!N->getOperand(1).hasOneUse() &&
!N->getOperand(2).hasOneUse()) {
// Scan all of the users of the LHS, looking for VCMPo's that match.
SDNode *VCMPoNode = nullptr;
SDNode *LHSN = N->getOperand(0).getNode();
for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
UI != E; ++UI)
if (UI->getOpcode() == PPCISD::VCMPo &&
UI->getOperand(1) == N->getOperand(1) &&
UI->getOperand(2) == N->getOperand(2) &&
UI->getOperand(0) == N->getOperand(0)) {
VCMPoNode = *UI;
break;
}
// If there is no VCMPo node, or if the flag value has a single use, don't
// transform this.
if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
break;
// Look at the (necessarily single) use of the flag value. If it has a
// chain, this transformation is more complex. Note that multiple things
// could use the value result, which we should ignore.
SDNode *FlagUser = nullptr;
for (SDNode::use_iterator UI = VCMPoNode->use_begin();
FlagUser == nullptr; ++UI) {
assert(UI != VCMPoNode->use_end() && "Didn't find user!");
SDNode *User = *UI;
for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
FlagUser = User;
break;
}
}
}
// If the user is a MFOCRF instruction, we know this is safe.
// Otherwise we give up for right now.
if (FlagUser->getOpcode() == PPCISD::MFOCRF)
return SDValue(VCMPoNode, 0);
}
break;
case ISD::BRCOND: {
SDValue Cond = N->getOperand(1);
SDValue Target = N->getOperand(2);
if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
Intrinsic::loop_decrement) {
// We now need to make the intrinsic dead (it cannot be instruction
// selected).
DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
assert(Cond.getNode()->hasOneUse() &&
"Counter decrement has more than one use");
return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
N->getOperand(0), Target);
}
}
break;
case ISD::BR_CC: {
// If this is a branch on an altivec predicate comparison, lower this so
// that we don't have to do a MFOCRF: instead, branch directly on CR6. This
// lowering is done pre-legalize, because the legalizer lowers the predicate
// compare down to code that is difficult to reassemble.
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
// Sometimes the promoted value of the intrinsic is ANDed with some non-zero
// value. If so, look through the AND to get to the intrinsic.
if (LHS.getOpcode() == ISD::AND &&
LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
Intrinsic::loop_decrement &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
!isNullConstant(LHS.getOperand(1)))
LHS = LHS.getOperand(0);
if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
Intrinsic::loop_decrement &&
isa<ConstantSDNode>(RHS)) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
"Counter decrement comparison is not EQ or NE");
unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
bool isBDNZ = (CC == ISD::SETEQ && Val) ||
(CC == ISD::SETNE && !Val);
// We now need to make the intrinsic dead (it cannot be instruction
// selected).
DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
assert(LHS.getNode()->hasOneUse() &&
"Counter decrement has more than one use");
return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
N->getOperand(0), N->getOperand(4));
}
int CompareOpc;
bool isDot;
if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
assert(isDot && "Can't compare against a vector result!");
// If this is a comparison against something other than 0/1, then we know
// that the condition is never/always true.
unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
if (Val != 0 && Val != 1) {
if (CC == ISD::SETEQ) // Cond never true, remove branch.
return N->getOperand(0);
// Always !=, turn it into an unconditional branch.
return DAG.getNode(ISD::BR, dl, MVT::Other,
N->getOperand(0), N->getOperand(4));
}
bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
// Create the PPCISD altivec 'dot' comparison node.
SDValue Ops[] = {
LHS.getOperand(2), // LHS of compare
LHS.getOperand(3), // RHS of compare
DAG.getConstant(CompareOpc, dl, MVT::i32)
};
EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
// Unpack the result based on how the target uses it.
PPC::Predicate CompOpc;
switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
default: // Can't happen, don't crash on invalid number though.
case 0: // Branch on the value of the EQ bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
break;
case 1: // Branch on the inverted value of the EQ bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
break;
case 2: // Branch on the value of the LT bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
break;
case 3: // Branch on the inverted value of the LT bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
break;
}
return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
DAG.getConstant(CompOpc, dl, MVT::i32),
DAG.getRegister(PPC::CR6, MVT::i32),
N->getOperand(4), CompNode.getValue(1));
}
break;
}
case ISD::BUILD_VECTOR:
return DAGCombineBuildVector(N, DCI);
case ISD::ABS:
return combineABS(N, DCI);
case ISD::VSELECT:
return combineVSelect(N, DCI);
}
return SDValue();
}
SDValue
PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
// fold (sdiv X, pow2)
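// PPCISD::SRA_ADDZE expands to an arithmetic shift right followed by
// addze: sra{w,d}i sets the carry when one bits are shifted out of a
// negative value, and addze adds it back to round the quotient toward
// zero. For example, sdiv X, 4 becomes:
//   srawi rT, rX, 2
//   addze rD, rT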
EVT VT = N->getValueType(0);
if (VT == MVT::i64 && !Subtarget.isPPC64())
return SDValue();
if ((VT != MVT::i32 && VT != MVT::i64) ||
!(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
bool IsNegPow2 = (-Divisor).isPowerOf2();
unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros();
SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT);
SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
Created.push_back(Op.getNode());
if (IsNegPow2) {
Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
Created.push_back(Op.getNode());
}
return Op;
}
//===----------------------------------------------------------------------===//
// Inline Assembly Support
//===----------------------------------------------------------------------===//
void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
Known.resetAll();
switch (Op.getOpcode()) {
default: break;
case PPCISD::LBRX: {
// lhbrx is known to have the top bits cleared out.
if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
Known.Zero = 0xFFFF0000;
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
default: break;
case Intrinsic::ppc_altivec_vcmpbfp_p:
case Intrinsic::ppc_altivec_vcmpeqfp_p:
case Intrinsic::ppc_altivec_vcmpequb_p:
case Intrinsic::ppc_altivec_vcmpequh_p:
case Intrinsic::ppc_altivec_vcmpequw_p:
case Intrinsic::ppc_altivec_vcmpequd_p:
case Intrinsic::ppc_altivec_vcmpgefp_p:
case Intrinsic::ppc_altivec_vcmpgtfp_p:
case Intrinsic::ppc_altivec_vcmpgtsb_p:
case Intrinsic::ppc_altivec_vcmpgtsh_p:
case Intrinsic::ppc_altivec_vcmpgtsw_p:
case Intrinsic::ppc_altivec_vcmpgtsd_p:
case Intrinsic::ppc_altivec_vcmpgtub_p:
case Intrinsic::ppc_altivec_vcmpgtuh_p:
case Intrinsic::ppc_altivec_vcmpgtuw_p:
case Intrinsic::ppc_altivec_vcmpgtud_p:
Known.Zero = ~1U; // All bits but the low one are known to be zero.
break;
}
}
}
}
Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
switch (Subtarget.getCPUDirective()) {
default: break;
case PPC::DIR_970:
case PPC::DIR_PWR4:
case PPC::DIR_PWR5:
case PPC::DIR_PWR5X:
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
case PPC::DIR_PWR_FUTURE: {
if (!ML)
break;
if (!DisableInnermostLoopAlign32) {
// If the nested loop is an innermost loop, prefer a 32-byte alignment
// so that we can decrease cache misses and branch-prediction misses.
// Actual alignment of the loop will depend on the hotness check and other
// logic in alignBlocks.
if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
return Align(32);
}
const PPCInstrInfo *TII = Subtarget.getInstrInfo();
// For small loops (between 5 and 8 instructions), align to a 32-byte
// boundary so that the entire loop fits in one instruction-cache line.
uint64_t LoopSize = 0;
for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) {
LoopSize += TII->getInstSizeInBytes(*J);
if (LoopSize > 32)
break;
}
if (LoopSize > 16 && LoopSize <= 32)
return Align(32);
break;
}
}
return TargetLowering::getPrefLoopAlignment(ML);
}
/// getConstraintType - Given a constraint, return the type of
/// constraint it is for this target.
PPCTargetLowering::ConstraintType
PPCTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break;
case 'b':
case 'r':
case 'f':
case 'd':
case 'v':
case 'y':
return C_RegisterClass;
case 'Z':
// FIXME: While Z does indicate a memory constraint, it specifically
// indicates an r+r address (used in conjunction with the 'y' modifier
// in the replacement string). Currently, we're forcing the base
// register to be r0 in the asm printer (which is interpreted as zero)
// and forming the complete address in the second register. This is
// suboptimal.
return C_Memory;
}
} else if (Constraint == "wc") { // individual CR bits.
return C_RegisterClass;
} else if (Constraint == "wa" || Constraint == "wd" ||
Constraint == "wf" || Constraint == "ws" ||
Constraint == "wi" || Constraint == "ww") {
return C_RegisterClass; // VSX registers.
}
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
PPCTargetLowering::getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
return CW_Register; // an individual CR bit.
else if ((StringRef(constraint) == "wa" ||
StringRef(constraint) == "wd" ||
StringRef(constraint) == "wf") &&
type->isVectorTy())
return CW_Register;
else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
return CW_Register; // registers that just hold 64-bit integer data.
else if (StringRef(constraint) == "ws" && type->isDoubleTy())
return CW_Register;
else if (StringRef(constraint) == "ww" && type->isFloatTy())
return CW_Register;
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
break;
case 'b':
if (type->isIntegerTy())
weight = CW_Register;
break;
case 'f':
if (type->isFloatTy())
weight = CW_Register;
break;
case 'd':
if (type->isDoubleTy())
weight = CW_Register;
break;
case 'v':
if (type->isVectorTy())
weight = CW_Register;
break;
case 'y':
weight = CW_Register;
break;
case 'Z':
weight = CW_Memory;
break;
}
return weight;
}
std::pair<unsigned, const TargetRegisterClass *>
PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
// GCC RS6000 Constraint Letters
switch (Constraint[0]) {
case 'b': // R1-R31
if (VT == MVT::i64 && Subtarget.isPPC64())
return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
case 'r': // R0-R31
if (VT == MVT::i64 && Subtarget.isPPC64())
return std::make_pair(0U, &PPC::G8RCRegClass);
return std::make_pair(0U, &PPC::GPRCRegClass);
// 'd' and 'f' constraints are both defined to be "the floating point
// registers", where one is for 32-bit and the other for 64-bit. We don't
// really care overly much here so just give them all the same reg classes.
case 'd':
case 'f':
if (Subtarget.hasSPE()) {
if (VT == MVT::f32 || VT == MVT::i32)
return std::make_pair(0U, &PPC::GPRCRegClass);
if (VT == MVT::f64 || VT == MVT::i64)
return std::make_pair(0U, &PPC::SPERCRegClass);
} else {
if (VT == MVT::f32 || VT == MVT::i32)
return std::make_pair(0U, &PPC::F4RCRegClass);
if (VT == MVT::f64 || VT == MVT::i64)
return std::make_pair(0U, &PPC::F8RCRegClass);
if (VT == MVT::v4f64 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QFRCRegClass);
if (VT == MVT::v4f32 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QSRCRegClass);
}
break;
case 'v':
if (VT == MVT::v4f64 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QFRCRegClass);
if (VT == MVT::v4f32 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QSRCRegClass);
if (Subtarget.hasAltivec())
return std::make_pair(0U, &PPC::VRRCRegClass);
break;
case 'y': // crrc
return std::make_pair(0U, &PPC::CRRCRegClass);
}
} else if (Constraint == "wc" && Subtarget.useCRBits()) {
// An individual CR bit.
return std::make_pair(0U, &PPC::CRBITRCRegClass);
} else if ((Constraint == "wa" || Constraint == "wd" ||
Constraint == "wf" || Constraint == "wi") &&
Subtarget.hasVSX()) {
return std::make_pair(0U, &PPC::VSRCRegClass);
} else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
if (VT == MVT::f32 && Subtarget.hasP8Vector())
return std::make_pair(0U, &PPC::VSSRCRegClass);
else
return std::make_pair(0U, &PPC::VSFRCRegClass);
}
// If we name a VSX register, we can't defer to the base class because it
// will not recognize the correct register (their names will be VSL{0-31}
// and V{0-31} so they won't match). So we match them here.
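// For example, "{vs35}" parses to VSNum == 35 and maps to V3, since
// vs32-vs63 alias the Altivec registers V0-V31.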
if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {
int VSNum = atoi(Constraint.data() + 3);
assert(VSNum >= 0 && VSNum <= 63 &&
"Attempted to access a vsr out of range");
if (VSNum < 32)
return std::make_pair(PPC::VSL0 + VSNum, &PPC::VSRCRegClass);
return std::make_pair(PPC::V0 + VSNum - 32, &PPC::VSRCRegClass);
}
std::pair<unsigned, const TargetRegisterClass *> R =
TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
// (which we call X[0-9]+). If a 64-bit value has been requested, and a
// 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
// register.
// FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
// the AsmName field from *RegisterInfo.td, then this would not be necessary.
if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
PPC::GPRCRegClass.contains(R.first))
return std::make_pair(TRI->getMatchingSuperReg(R.first,
PPC::sub_32, &PPC::G8RCRegClass),
&PPC::G8RCRegClass);
// GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
R.first = PPC::CR0;
R.second = &PPC::CRRCRegClass;
}
return R;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Only support length 1 constraints.
if (Constraint.length() > 1) return;
char Letter = Constraint[0];
switch (Letter) {
default: break;
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P': {
ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
if (!CST) return; // Must be an immediate to match.
SDLoc dl(Op);
int64_t Value = CST->getSExtValue();
EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
// numbers are printed as such.
switch (Letter) {
default: llvm_unreachable("Unknown constraint letter!");
case 'I': // "I" is a signed 16-bit constant.
if (isInt<16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
if (isShiftedUInt<16, 16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
if (isShiftedInt<16, 16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
if (isUInt<16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'M': // "M" is a constant that is greater than 31.
if (Value > 31)
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'N': // "N" is a positive constant that is an exact power of two.
if (Value > 0 && isPowerOf2_64(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'O': // "O" is the constant zero.
if (Value == 0)
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
if (isInt<16>(-Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
break;
}
break;
}
}
if (Result.getNode()) {
Ops.push_back(Result);
return;
}
// Handle standard constraint letters.
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS, Instruction *I) const {
// PPC does not allow r+i addressing modes for vectors!
if (Ty->isVectorTy() && AM.BaseOffs != 0)
return false;
// PPC allows a sign-extended 16-bit immediate field.
if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
return false;
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
// PPC only supports r+r addressing.
switch (AM.Scale) {
case 0: // "r+i" or just "i", depending on HasBaseReg.
break;
case 1:
if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
return false;
// Otherwise we have r+r or r+i.
break;
case 2:
if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
return false;
// Allow 2*r as r+r.
break;
default:
// No other scales are supported.
return false;
}
return true;
}
SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setReturnAddressIsTaken(true);
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
// Make sure the function does not optimize away the store of the RA to
// the stack.
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setLRStoreRequired();
bool isPPC64 = Subtarget.isPPC64();
auto PtrVT = getPointerTy(MF.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset =
DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
isPPC64 ? MVT::i64 : MVT::i32);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
MachinePointerInfo());
}
// Just load the return address off the stack.
SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setFrameAddressIsTaken(true);
EVT PtrVT = getPointerTy(MF.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
// Naked functions never have a frame pointer, and so we use r1. For all
// other functions, this decision must be delayed until during PEI.
unsigned FrameReg;
if (MF.getFunction().hasFnAttribute(Attribute::Naked))
FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
else
FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
PtrVT);
while (Depth--)
FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
FrameAddr, MachinePointerInfo());
return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register PPCTargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
bool isPPC64 = Subtarget.isPPC64();
bool IsDarwinABI = Subtarget.isDarwinABI();
bool is64Bit = isPPC64 && VT == LLT::scalar(64);
if (!is64Bit && VT != LLT::scalar(32))
report_fatal_error("Invalid register global variable type");
Register Reg = StringSwitch<Register>(RegName)
.Case("r1", is64Bit ? PPC::X1 : PPC::R1)
.Case("r2", (IsDarwinABI || isPPC64) ? Register() : PPC::R2)
.Case("r13", (!isPPC64 && IsDarwinABI) ? Register() :
(is64Bit ? PPC::X13 : PPC::R13))
.Default(Register());
if (Reg)
return Reg;
report_fatal_error("Invalid register name global variable");
}
bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
// The 32-bit SVR4 ABI accesses everything as got-indirect.
if (Subtarget.is32BitELFABI())
return true;
// AIX accesses everything indirectly through the TOC, which is similar to
// the GOT.
if (Subtarget.isAIXABI())
return true;
CodeModel::Model CModel = getTargetMachine().getCodeModel();
// If it is small or large code model, module locals are accessed
// indirectly by loading their address from .toc/.got.
if (CModel == CodeModel::Small || CModel == CodeModel::Large)
return true;
// JumpTable and BlockAddress are accessed as got-indirect.
if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
return true;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA))
return Subtarget.isGVIndirectSymbol(G->getGlobal());
return false;
}
bool
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// The PowerPC target isn't yet aware of offsets.
return false;
}
bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvlfd:
case Intrinsic::ppc_qpx_qvlfs:
case Intrinsic::ppc_qpx_qvlfcd:
case Intrinsic::ppc_qpx_qvlfcs:
case Intrinsic::ppc_qpx_qvlfiwa:
case Intrinsic::ppc_qpx_qvlfiwz:
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
case Intrinsic::ppc_altivec_lvehx:
case Intrinsic::ppc_altivec_lvewx:
case Intrinsic::ppc_vsx_lxvd2x:
case Intrinsic::ppc_vsx_lxvw4x: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_lvebx:
VT = MVT::i8;
break;
case Intrinsic::ppc_altivec_lvehx:
VT = MVT::i16;
break;
case Intrinsic::ppc_altivec_lvewx:
VT = MVT::i32;
break;
case Intrinsic::ppc_vsx_lxvd2x:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvlfd:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvlfs:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvlfcd:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvlfcs:
VT = MVT::v2f32;
break;
default:
VT = MVT::v4i32;
break;
}
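// These loads ignore the low address bits, so the access may begin up to
// (size - 1) bytes before the given pointer; model it conservatively as
// the surrounding 2*size-1 byte window.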
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = VT;
Info.ptrVal = I.getArgOperand(0);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
Info.align = Align::None();
Info.flags = MachineMemOperand::MOLoad;
return true;
}
case Intrinsic::ppc_qpx_qvlfda:
case Intrinsic::ppc_qpx_qvlfsa:
case Intrinsic::ppc_qpx_qvlfcda:
case Intrinsic::ppc_qpx_qvlfcsa:
case Intrinsic::ppc_qpx_qvlfiwaa:
case Intrinsic::ppc_qpx_qvlfiwza: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvlfda:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvlfsa:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvlfcda:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvlfcsa:
VT = MVT::v2f32;
break;
default:
VT = MVT::v4i32;
break;
}
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = VT;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.size = VT.getStoreSize();
Info.align = Align::None();
Info.flags = MachineMemOperand::MOLoad;
return true;
}
case Intrinsic::ppc_qpx_qvstfd:
case Intrinsic::ppc_qpx_qvstfs:
case Intrinsic::ppc_qpx_qvstfcd:
case Intrinsic::ppc_qpx_qvstfcs:
case Intrinsic::ppc_qpx_qvstfiw:
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_altivec_stvebx:
case Intrinsic::ppc_altivec_stvehx:
case Intrinsic::ppc_altivec_stvewx:
case Intrinsic::ppc_vsx_stxvd2x:
case Intrinsic::ppc_vsx_stxvw4x: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_stvebx:
VT = MVT::i8;
break;
case Intrinsic::ppc_altivec_stvehx:
VT = MVT::i16;
break;
case Intrinsic::ppc_altivec_stvewx:
VT = MVT::i32;
break;
case Intrinsic::ppc_vsx_stxvd2x:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvstfd:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvstfs:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvstfcd:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvstfcs:
VT = MVT::v2f32;
break;
default:
VT = MVT::v4i32;
break;
}
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = VT;
Info.ptrVal = I.getArgOperand(1);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
Info.align = Align::None();
Info.flags = MachineMemOperand::MOStore;
return true;
}
case Intrinsic::ppc_qpx_qvstfda:
case Intrinsic::ppc_qpx_qvstfsa:
case Intrinsic::ppc_qpx_qvstfcda:
case Intrinsic::ppc_qpx_qvstfcsa:
case Intrinsic::ppc_qpx_qvstfiwa: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvstfda:
VT = MVT::v4f64;
break;
case Intrinsic::ppc_qpx_qvstfsa:
VT = MVT::v4f32;
break;
case Intrinsic::ppc_qpx_qvstfcda:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_qpx_qvstfcsa:
VT = MVT::v2f32;
break;
default:
VT = MVT::v4i32;
break;
}
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = VT;
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.size = VT.getStoreSize();
Info.align = Align::None();
Info.flags = MachineMemOperand::MOStore;
return true;
}
default:
break;
}
return false;
}
/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero that means the destination
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
/// means there isn't a need to check it against alignment requirement,
/// probably because the source does not need to be loaded. If 'IsMemset' is
/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT PPCTargetLowering::getOptimalMemOpType(
uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const {
if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
// When expanding a memset, require at least two QPX instructions to cover
// the cost of loading the value to be stored from the constant pool.
if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
(!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
return MVT::v4f64;
}
// We should use Altivec/VSX loads and stores when available. For unaligned
// addresses, unaligned VSX loads are only fast starting with the P8.
if (Subtarget.hasAltivec() && Size >= 16 &&
(((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
return MVT::v4i32;
}
if (Subtarget.isPPC64()) {
return MVT::i64;
}
return MVT::i32;
}
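// Worked example (illustrative, not exhaustive): for a 64-byte memcpy at
// -O2 on a QPX subtarget with 32-byte aligned (or unknown-alignment) source
// and destination and no NoImplicitFloat attribute, the hook above returns
// MVT::v4f64, so the expansion copies 32 bytes per load/store pair. On an
// Altivec subtarget with 16-byte alignment it returns MVT::v4i32, and a
// plain PPC64 target falls back to MVT::i64.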
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
return !(BitSize == 0 || BitSize > 64);
}
bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 == 64 && NumBits2 == 32;
}
bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 == 64 && NumBits2 == 32;
}
bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
// Generally speaking, zexts are not free, but they are free when they can be
// folded with other operations.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
EVT MemVT = LD->getMemoryVT();
if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
(Subtarget.isPPC64() && MemVT == MVT::i32)) &&
(LD->getExtensionType() == ISD::NON_EXTLOAD ||
LD->getExtensionType() == ISD::ZEXTLOAD))
return true;
}
// FIXME: Add other cases...
// - 32-bit shifts with a zext to i64
// - zext after ctlz, bswap, etc.
// - zext after and by a constant mask
return TargetLowering::isZExtFree(Val, VT2);
}
bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
"invalid fpext types");
// Extending to float128 is not free.
if (DestVT == MVT::f128)
return false;
return true;
}
bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<16>(Imm) || isUInt<16>(Imm);
}
bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
return isInt<16>(Imm) || isUInt<16>(Imm);
}
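// Examples (illustrative): both hooks accept exactly the values that fit in
// a single 16-bit immediate field, signed or unsigned:
//   isLegalAddImmediate(-32768) == true   (isInt<16>)
//   isLegalAddImmediate(65535)  == true   (isUInt<16>)
//   isLegalAddImmediate(65536)  == false  (must be materialized first)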
bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned,
unsigned,
MachineMemOperand::Flags,
bool *Fast) const {
if (DisablePPCUnaligned)
return false;
// PowerPC supports unaligned memory access for simple non-vector types.
// Although accessing unaligned addresses is not as efficient as accessing
// aligned addresses, it is generally more efficient than manual expansion,
// and generally only traps for software emulation when crossing page
// boundaries.
if (!VT.isSimple())
+ return false;
+
+ if (VT.isFloatingPoint() && !VT.isVector() &&
+ !Subtarget.allowsUnalignedFPAccess())
return false;
if (VT.getSimpleVT().isVector()) {
if (Subtarget.hasVSX()) {
if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
VT != MVT::v4f32 && VT != MVT::v4i32)
return false;
} else {
return false;
}
}
if (VT == MVT::ppcf128)
return false;
if (Fast)
*Fast = true;
return true;
}
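// Examples (illustrative, assuming VSX is available and unaligned accesses
// are not disabled):
//   allowsMisalignedMemoryAccesses(MVT::v4i32, ...)   -> true, *Fast = true
//   allowsMisalignedMemoryAccesses(MVT::v8i16, ...)   -> false (not one of
//                                                        the VSX types above)
//   allowsMisalignedMemoryAccesses(MVT::ppcf128, ...) -> false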
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
VT = VT.getScalarType();
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
return true;
case MVT::f128:
return (EnableQuadPrecision && Subtarget.hasP9Vector());
default:
break;
}
return false;
}
const MCPhysReg *
PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
// site. Hence we include LR in the scratch registers, which are in turn added
// as implicit-defs for stackmaps and patchpoints. The same reasoning applies
// to CTR, which is used by any indirect call.
static const MCPhysReg ScratchRegs[] = {
PPC::X12, PPC::LR8, PPC::CTR8, 0
};
return ScratchRegs;
}
unsigned PPCTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
}
unsigned PPCTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
}
bool
PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
EVT VT , unsigned DefinedValues) const {
if (VT == MVT::v2i64)
return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
if (Subtarget.hasVSX() || Subtarget.hasQPX())
return true;
return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
}
Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
if (DisableILPPref || Subtarget.enableMachineScheduler())
return TargetLowering::getSchedulingPreference(N);
return Sched::ILP;
}
// Create a fast isel object.
FastISel *
PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) const {
return PPC::createFastISel(FuncInfo, LibInfo);
}
void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
if (Subtarget.isDarwinABI()) return;
if (!Subtarget.isPPC64()) return;
// Update IsSplitCSR in PPCFunctionInfo
PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>();
PFI->setIsSplitCSR(true);
}
void PPCTargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (PPC::G8RCRegClass.contains(*I))
RC = &PPC::G8RCRegClass;
else if (PPC::F8RCRegClass.contains(*I))
RC = &PPC::F8RCRegClass;
else if (PPC::CRRCRegClass.contains(*I))
RC = &PPC::CRRCRegClass;
else if (PPC::VRRCRegClass.contains(*I))
RC = &PPC::VRRCRegClass;
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
assert(Entry->getParent()->getFunction().hasFnAttribute(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
// Override to enable LOAD_STACK_GUARD lowering on Linux.
bool PPCTargetLowering::useLoadStackGuardNode() const {
if (!Subtarget.isTargetLinux())
return TargetLowering::useLoadStackGuardNode();
return true;
}
// Override to disable global variable loading on Linux.
void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
if (!Subtarget.isTargetLinux())
return TargetLowering::insertSSPDeclarations(M);
}
bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
if (!VT.isSimple() || !Subtarget.hasVSX())
return false;
switch(VT.getSimpleVT().SimpleTy) {
default:
// For FP types that are currently not supported by PPC backend, return
// false. Examples: f16, f80.
return false;
case MVT::f32:
case MVT::f64:
case MVT::ppcf128:
return Imm.isPosZero();
}
}
// For vector shift operation op, fold
// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
unsigned Opcode = N->getOpcode();
unsigned TargetOpcode;
switch (Opcode) {
default:
llvm_unreachable("Unexpected shift operation");
case ISD::SHL:
TargetOpcode = PPCISD::SHL;
break;
case ISD::SRL:
TargetOpcode = PPCISD::SRL;
break;
case ISD::SRA:
TargetOpcode = PPCISD::SRA;
break;
}
if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
N1->getOpcode() == ISD::AND)
if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
if (Mask->getZExtValue() == OpSizeInBits - 1)
return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));
return SDValue();
}
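// Worked example (illustrative): for a v4i32 shift, OpSizeInBits is 32 and
// the modulo mask is 31, so
//   (shl x:v4i32, (and y, <31,31,31,31>))  ->  (PPCISD::SHL x, y)
// The AND is redundant because vslw already uses only the low five bits of
// each element's shift amount (see the PPCISD shift node comments in
// PPCISelLowering.h).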
SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
SDValue N0 = N->getOperand(0);
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!Subtarget.isISA3_0() ||
N0.getOpcode() != ISD::SIGN_EXTEND ||
N0.getOperand(0).getValueType() != MVT::i32 ||
CN1 == nullptr || N->getValueType(0) != MVT::i64)
return SDValue();
// We can't save an operation here if the value is already extended, and
// the existing shift is easier to combine.
SDValue ExtsSrc = N0.getOperand(0);
if (ExtsSrc.getOpcode() == ISD::TRUNCATE &&
ExtsSrc.getOperand(0).getOpcode() == ISD::AssertSext)
return SDValue();
SDLoc DL(N0);
SDValue ShiftBy = SDValue(CN1, 0);
// We want the shift amount to be i32 on the extswsli, but the incoming
// shift amount could be i64.
if (ShiftBy.getValueType() == MVT::i64)
ShiftBy = DCI.DAG.getConstant(CN1->getZExtValue(), DL, MVT::i32);
return DCI.DAG.getNode(PPCISD::EXTSWSLI, DL, MVT::i64, N0->getOperand(0),
ShiftBy);
}
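// Worked example (illustrative, ISA 3.0 only): the combine above rewrites
//   (shl (sign_extend i32:x to i64), 3)
// as (PPCISD::EXTSWSLI x, 3), selecting a single extswsli instead of an
// extsw followed by a separate 64-bit shift.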
SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
return SDValue();
}
SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
return SDValue();
}
// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1))
// Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0))
// When C is zero, the expression (addi Z, -C) simplifies to Z
// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types
static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) {
if (!Subtarget.isPPC64())
return SDValue();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
auto isZextOfCompareWithConstant = [](SDValue Op) {
if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() ||
Op.getValueType() != MVT::i64)
return false;
SDValue Cmp = Op.getOperand(0);
if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() ||
Cmp.getOperand(0).getValueType() != MVT::i64)
return false;
if (auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1))) {
int64_t NegConstant = 0 - Constant->getSExtValue();
// Due to the limitations of the addi instruction,
// -C is required to be in [-32768, 32767].
return isInt<16>(NegConstant);
}
return false;
};
bool LHSHasPattern = isZextOfCompareWithConstant(LHS);
bool RHSHasPattern = isZextOfCompareWithConstant(RHS);
// If there is a pattern, canonicalize a zext operand to the RHS.
if (LHSHasPattern && !RHSHasPattern)
std::swap(LHS, RHS);
else if (!LHSHasPattern && !RHSHasPattern)
return SDValue();
SDLoc DL(N);
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue);
SDValue Cmp = RHS.getOperand(0);
SDValue Z = Cmp.getOperand(0);
auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
assert(Constant && "Constant Should not be a null pointer.");
int64_t NegConstant = 0 - Constant->getSExtValue();
switch(cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) {
default: break;
case ISD::SETNE: {
// when C == 0
// --> addze X, (addic Z, -1).carry
// /
// add X, (zext(setne Z, C))--
// \ when -32768 <= -C <= 32767 && C != 0
// --> addze X, (addic (addi Z, -C), -1).carry
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
DAG.getConstant(NegConstant, DL, MVT::i64));
SDValue AddOrZ = NegConstant != 0 ? Add : Z;
SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64));
return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
SDValue(Addc.getNode(), 1));
}
case ISD::SETEQ: {
// when C == 0
// --> addze X, (subfic Z, 0).carry
// /
// add X, (zext(sete Z, C))--
// \ when -32768 <= -C <= 32767 && C != 0
// --> addze X, (subfic (addi Z, -C), 0).carry
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
DAG.getConstant(NegConstant, DL, MVT::i64));
SDValue AddOrZ = NegConstant != 0 ? Add : Z;
SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
DAG.getConstant(0, DL, MVT::i64), AddOrZ);
return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
SDValue(Subc.getNode(), 1));
}
}
return SDValue();
}
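// Worked example (illustrative): with C = 5 (so -C = -5 fits addi's signed
// 16-bit immediate), the SETNE case rewrites
//   (add X, (zext (setne Z, 5)))
// as
//   (addze X, (addic (addi Z, -5), -1).carry)
// The addic of (Z - 5) with -1 produces a carry exactly when Z - 5 != 0, so
// addze adds 1 to X precisely when Z != 5, without materializing the i1
// compare result in a GPR.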
SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
return Value;
return SDValue();
}
// Detect TRUNCATE operations on bitcasts of float128 values.
// What we are looking for here is the situation where we extract a subset
// of bits from a 128-bit float.
// This can be of two forms:
// 1) BITCAST of f128 feeding TRUNCATE
// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
// This is required because we do not have a legal i128 type, so we want to
// prevent having to store the f128 and then reload part of it.
SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
DAGCombinerInfo &DCI) const {
// If we are using CRBits then try that first.
if (Subtarget.useCRBits()) {
// Check if CRBits did anything and return that if it did.
if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
return CRTruncValue;
}
SDLoc dl(N);
SDValue Op0 = N->getOperand(0);
// Looking for a truncate of i128 to i64.
if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
return SDValue();
int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;
// SRL feeding TRUNCATE.
if (Op0.getOpcode() == ISD::SRL) {
ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
// The right shift has to be by 64 bits.
if (!ConstNode || ConstNode->getZExtValue() != 64)
return SDValue();
// Switch the element number to extract.
EltToExtract = EltToExtract ? 0 : 1;
// Update Op0 past the SRL.
Op0 = Op0.getOperand(0);
}
// BITCAST feeding a TRUNCATE possibly via SRL.
if (Op0.getOpcode() == ISD::BITCAST &&
Op0.getValueType() == MVT::i128 &&
Op0.getOperand(0).getValueType() == MVT::f128) {
SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0));
return DCI.DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast,
DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32));
}
return SDValue();
}
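// Worked example (illustrative, big-endian): the combine rewrites
//   (truncate i64 (srl (bitcast f128:x to i128), 64))
// as
//   (extract_vector_elt (bitcast x to v2i64), 0)
// so the high half of the f128 is read with a vector extract instead of
// being stored to memory and partially reloaded.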
SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N->getOperand(1));
if (!ConstOpOrElement)
return SDValue();
// An imul is usually smaller than the alternative sequence for a legal type.
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
isOperationLegal(ISD::MUL, N->getValueType(0)))
return SDValue();
auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
switch (this->Subtarget.getCPUDirective()) {
default:
// TODO: enhance the condition for subtarget before pwr8
return false;
case PPC::DIR_PWR8:
// type mul add shl
// scalar 4 1 1
// vector 7 2 2
return true;
case PPC::DIR_PWR9:
case PPC::DIR_PWR_FUTURE:
// type mul add shl
// scalar 5 2 2
// vector 7 2 2
// The cycle ratios of the related operations are shown in the table
// above: mul is 5 (scalar) / 7 (vector), while add/sub/shl are all 2 for
// both scalar and vector types. The 2-instruction patterns (add/sub + shl)
// cost 4 cycles, so they are always profitable; but the 3-instruction
// pattern (mul x, -(2^N + 1)) => -(add (shl x, N), x) costs 6 cycles
// (sub + add + shl), so we should only do it for vector types.
return IsAddOne && IsNeg ? VT.isVector() : true;
}
};
EVT VT = N->getValueType(0);
SDLoc DL(N);
const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
bool IsNeg = MulAmt.isNegative();
APInt MulAmtAbs = MulAmt.abs();
if ((MulAmtAbs - 1).isPowerOf2()) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
// (mul x, -(2^N + 1)) => -(add (shl x, N), x)
if (!IsProfitable(IsNeg, true, VT))
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 =
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant((MulAmtAbs - 1).logBase2(), DL, VT));
SDValue Res = DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
if (!IsNeg)
return Res;
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
} else if ((MulAmtAbs + 1).isPowerOf2()) {
// (mul x, 2^N - 1) => (sub (shl x, N), x)
// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
if (!IsProfitable(IsNeg, false, VT))
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 =
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant((MulAmtAbs + 1).logBase2(), DL, VT));
if (!IsNeg)
return DAG.getNode(ISD::SUB, DL, VT, Op1, Op0);
else
return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
} else {
return SDValue();
}
}
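// Worked examples (illustrative), subject to the IsProfitable() check:
//   (mul x, 5):  MulAmtAbs - 1 = 4 is a power of 2
//     -> (add (shl x, 2), x)
//   (mul x, -3): MulAmtAbs - 1 = 2 is a power of 2, IsNeg
//     -> (sub 0, (add (shl x, 1), x))
//   (mul x, 7):  MulAmtAbs + 1 = 8 is a power of 2
//     -> (sub (shl x, 3), x)
//   (mul x, -7): MulAmtAbs + 1 = 8 is a power of 2, IsNeg
//     -> (sub x, (shl x, 3))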
bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// Only duplicate to increase tail calls for the 64-bit SysV ABIs.
if (!Subtarget.is64BitELFABI())
return false;
// If not a tail call then no need to proceed.
if (!CI->isTailCall())
return false;
// If sibling calls have been disabled and tail-calls aren't guaranteed
// there is no reason to duplicate.
auto &TM = getTargetMachine();
if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
return false;
// Can't tail call a function called indirectly, or if it has variadic args.
const Function *Callee = CI->getCalledFunction();
if (!Callee || Callee->isVarArg())
return false;
// Make sure the callee and caller calling conventions are eligible for tco.
const Function *Caller = CI->getParent()->getParent();
if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
CI->getCallingConv()))
return false;
// If the function is local then we have a good chance at tail-calling it
return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
}
bool PPCTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
if (!Subtarget.hasVSX())
return false;
if (Subtarget.hasP9Vector() && VT == MVT::f128)
return true;
return VT == MVT::f32 || VT == MVT::f64 ||
VT == MVT::v4f32 || VT == MVT::v2f64;
}
bool PPCTargetLowering::
isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
const Value *Mask = AndI.getOperand(1);
// If the mask is suitable for andi. or andis. we should sink the and.
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) {
// Can't handle constants wider than 64-bits.
if (CI->getBitWidth() > 64)
return false;
int64_t ConstVal = CI->getZExtValue();
return isUInt<16>(ConstVal) ||
(isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF));
}
// For non-constant masks, we can always use the record-form and.
return true;
}
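// Examples (illustrative): masks accepted above correspond to a single
// record-form immediate instruction:
//   and(x, 0x0000FFFF) -> andi.  (isUInt<16>)
//   and(x, 0x00FF0000) -> andis. (high halfword fits, low 16 bits clear)
//   and(x, 0x00FF00FF) -> rejected: needs more than one instruction.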
// Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0)
// Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0)
// Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0)
// Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0)
// Transform (abs (sub a, b) to (vabsd a b 1)) if a & b of type v4i32
SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const {
assert((N->getOpcode() == ISD::ABS) && "Need ABS node here");
assert(Subtarget.hasP9Altivec() &&
"Only combine this when P9 altivec supported!");
EVT VT = N->getValueType(0);
if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
if (N->getOperand(0).getOpcode() == ISD::SUB) {
// Even for signed integers this is safe: both inputs are zero-extended, so
// they are known to be non-negative as signed values and the unsigned
// absolute-difference instruction gives the correct result.
unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode();
unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode();
if ((SubOpcd0 == ISD::ZERO_EXTEND ||
SubOpcd0 == ISD::ZERO_EXTEND_VECTOR_INREG) &&
(SubOpcd1 == ISD::ZERO_EXTEND ||
SubOpcd1 == ISD::ZERO_EXTEND_VECTOR_INREG)) {
return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
N->getOperand(0)->getOperand(0),
N->getOperand(0)->getOperand(1),
DAG.getTargetConstant(0, dl, MVT::i32));
}
// For type v4i32, it can be optimized with xvnegsp + vabsduw
if (N->getOperand(0).getValueType() == MVT::v4i32 &&
N->getOperand(0).hasOneUse()) {
return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
N->getOperand(0)->getOperand(0),
N->getOperand(0)->getOperand(1),
DAG.getTargetConstant(1, dl, MVT::i32));
}
}
return SDValue();
}
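// Worked example (illustrative, using the loose notation of the comment
// above): for v4i32 operands known non-negative because they come from
// zero-extends,
//   (abs (sub (zext a), (zext b)))  ->  (vabsd a b 0)
// which selects to a single vabsduw. Plain v4i32 subtractions instead get
// (vabsd a b 1), letting selection insert the xvnegsp sign-bit fixups
// described in PPCISelLowering.h.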
// For type v4i32/v8i16/v16i8, transform
// from (vselect (setcc a, b, setugt), (sub a, b), (sub b, a)) to (vabsd a, b)
// from (vselect (setcc a, b, setuge), (sub a, b), (sub b, a)) to (vabsd a, b)
// from (vselect (setcc a, b, setult), (sub b, a), (sub a, b)) to (vabsd a, b)
// from (vselect (setcc a, b, setule), (sub b, a), (sub a, b)) to (vabsd a, b)
SDValue PPCTargetLowering::combineVSelect(SDNode *N,
DAGCombinerInfo &DCI) const {
assert((N->getOpcode() == ISD::VSELECT) && "Need VSELECT node here");
assert(Subtarget.hasP9Altivec() &&
"Only combine this when P9 altivec supported!");
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Cond = N->getOperand(0);
SDValue TrueOpnd = N->getOperand(1);
SDValue FalseOpnd = N->getOperand(2);
EVT VT = N->getOperand(1).getValueType();
if (Cond.getOpcode() != ISD::SETCC || TrueOpnd.getOpcode() != ISD::SUB ||
FalseOpnd.getOpcode() != ISD::SUB)
return SDValue();
// ABSD is only available for types v4i32/v8i16/v16i8
if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
return SDValue();
// Only fold if at least one operand has a single use, so that the combine
// saves at least one dependent computation.
if (!(Cond.hasOneUse() || TrueOpnd.hasOneUse() || FalseOpnd.hasOneUse()))
return SDValue();
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Can only handle unsigned comparison here
switch (CC) {
default:
return SDValue();
case ISD::SETUGT:
case ISD::SETUGE:
break;
case ISD::SETULT:
case ISD::SETULE:
std::swap(TrueOpnd, FalseOpnd);
break;
}
SDValue CmpOpnd1 = Cond.getOperand(0);
SDValue CmpOpnd2 = Cond.getOperand(1);
// SETCC CmpOpnd1 CmpOpnd2 cond
// TrueOpnd = CmpOpnd1 - CmpOpnd2
// FalseOpnd = CmpOpnd2 - CmpOpnd1
if (TrueOpnd.getOperand(0) == CmpOpnd1 &&
TrueOpnd.getOperand(1) == CmpOpnd2 &&
FalseOpnd.getOperand(0) == CmpOpnd2 &&
FalseOpnd.getOperand(1) == CmpOpnd1) {
return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(1).getValueType(),
CmpOpnd1, CmpOpnd2,
DAG.getTargetConstant(0, dl, MVT::i32));
}
return SDValue();
}
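// Worked example (illustrative): for v4i32 values a and b,
//   (vselect (setcc a, b, setugt), (sub a, b), (sub b, a))
// computes the unsigned absolute difference |a - b|, so it is rewritten as
// (vabsd a b 0) and selects to a single vabsduw.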
Index: head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h (revision 362609)
@@ -1,1263 +1,1263 @@
//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that PPC uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
#define LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
#include "PPCInstrInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MachineValueType.h"
#include <utility>
namespace llvm {
namespace PPCISD {
// When adding a NEW PPCISD node please add it to the correct position in
// the enum. The order of elements in this enum matters!
// Values that are added after this entry:
// STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE
// are considered memory opcodes and are treated differently than entries
// that come before it. For example, ADD or MUL should be placed before
// the ISD::FIRST_TARGET_MEMORY_OPCODE while a LOAD or STORE should come
// after it.
enum NodeType : unsigned {
// Start the numbering where the builtin ops and target ops leave off.
FIRST_NUMBER = ISD::BUILTIN_OP_END,
/// FSEL - Traditional three-operand fsel node.
///
FSEL,
/// XSMAXCDP, XSMINCDP - C-type min/max instructions.
XSMAXCDP,
XSMINCDP,
/// FCFID - The FCFID instruction, taking an f64 operand and producing
/// an f64 value containing the FP representation of the integer that
/// was temporarily in the f64 operand.
FCFID,
/// Newer FCFID[US] integer-to-floating-point conversion instructions for
/// unsigned integers and single-precision outputs.
FCFIDU,
FCFIDS,
FCFIDUS,
/// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
/// operand, producing an f64 value containing the integer representation
/// of that FP value.
FCTIDZ,
FCTIWZ,
/// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for
/// unsigned integers with round toward zero.
FCTIDUZ,
FCTIWUZ,
/// Floating-point-to-integer conversion instructions
FP_TO_UINT_IN_VSR,
FP_TO_SINT_IN_VSR,
/// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in
/// VSFRC that is sign-extended from ByteWidth to a 64-bit integer.
VEXTS,
/// SExtVElems, takes an input vector of a smaller type and sign
/// extends to an output vector of a larger type.
SExtVElems,
/// Reciprocal estimate instructions (unary FP ops).
FRE,
FRSQRTE,
// VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
// three v4f32 operands and producing a v4f32 result.
VMADDFP,
VNMSUBFP,
/// VPERM - The PPC VPERM Instruction.
///
VPERM,
/// XXSPLT - The PPC VSX splat instructions
///
XXSPLT,
/// VECINSERT - The PPC vector insert instruction
///
VECINSERT,
/// VECSHL - The PPC vector shift left instruction
///
VECSHL,
/// XXPERMDI - The PPC XXPERMDI instruction
///
XXPERMDI,
/// The CMPB instruction (takes two operands of i32 or i64).
CMPB,
/// Hi/Lo - These represent the high and low 16-bit parts of a global
/// address respectively. These nodes have two operands, the first of
/// which must be a TargetGlobalAddress, and the second of which must be a
/// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C',
/// though these are usually folded into other nodes.
Hi,
Lo,
/// The following two target-specific nodes are used for calls through
/// function pointers in the 64-bit SVR4 ABI.
/// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
/// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
/// compute an allocation on the stack.
DYNALLOC,
/// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
/// compute an offset from native SP to the address of the most recent
/// dynamic alloca.
DYNAREAOFFSET,
/// GlobalBaseReg - On Darwin, this node represents the result of the mflr
/// at function entry, used for PIC code.
GlobalBaseReg,
/// These nodes represent PPC shifts.
///
/// For scalar types, only the last `n + 1` bits of the shift amounts
/// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc.
/// for exact behaviors.
///
/// For vector types, only the last n bits are used. See vsld.
SRL,
SRA,
SHL,
/// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign
/// word and shift left immediate.
EXTSWSLI,
/// The combination of sra[wd]i and addze used to implement signed
/// integer division by a power of 2. The first operand is the dividend,
/// and the second is the constant shift amount (representing the
/// divisor).
SRA_ADDZE,
/// CALL - A direct function call.
/// CALL_NOP is a call with the special NOP which follows 64-bit
/// SVR4 calls and 32-bit/64-bit AIX calls.
CALL,
CALL_NOP,
/// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
/// MTCTR instruction.
MTCTR,
/// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
/// BCTRL instruction.
BCTRL,
/// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl
/// instruction and the TOC reload required on 64-bit ELF, 32-bit AIX
/// and 64-bit AIX.
BCTRL_LOAD_TOC,
/// Return with a flag operand, matched by 'blr'
RET_FLAG,
/// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction.
/// This copies the bits corresponding to the specified CRREG into the
/// resultant GPR. Bits corresponding to other CR regs are undefined.
MFOCRF,
/// Direct move from a VSX register to a GPR
MFVSR,
/// Direct move from a GPR to a VSX register (algebraic)
MTVSRA,
/// Direct move from a GPR to a VSX register (zero)
MTVSRZ,
/// Direct move of 2 consecutive GPR to a VSX register.
BUILD_FP128,
/// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and
/// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is
/// unsupported for this target.
/// Merge 2 GPRs to a single SPE register.
BUILD_SPE64,
/// Extract SPE register component, second argument is high or low.
EXTRACT_SPE,
/// Extract a subvector from signed integer vector and convert to FP.
/// It is primarily used to convert a (widened) illegal integer vector
/// type to a legal floating point vector type.
/// For example v2i32 -> widened to v4i32 -> v2f64
SINT_VEC_TO_FP,
/// Extract a subvector from unsigned integer vector and convert to FP.
/// As with SINT_VEC_TO_FP, used for converting illegal types.
UINT_VEC_TO_FP,
// FIXME: Remove these once the ANDI glue bug is fixed:
/// i1 = ANDI_rec_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
/// eq or gt bit of CR0 after executing andi. x, 1. This is used to
/// implement truncation of i32 or i64 to i1.
ANDI_rec_1_EQ_BIT,
ANDI_rec_1_GT_BIT,
// READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit
// target (returns (Lo, Hi)). It takes a chain operand.
READ_TIME_BASE,
// EH_SJLJ_SETJMP - SjLj exception handling setjmp.
EH_SJLJ_SETJMP,
// EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
EH_SJLJ_LONGJMP,
/// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
/// instructions. For lack of a better number, we use the opcode number
/// encoding for the OPC field to identify the compare. For example, 838
/// is VCMPGTSH.
VCMP,
/// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
/// altivec VCMP*o instructions. For lack of a better number, we use the
/// opcode number encoding for the OPC field to identify the compare. For
/// example, 838 is VCMPGTSH.
VCMPo,
/// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
/// corresponds to the COND_BRANCH pseudo instruction. CRRC is the
/// condition register to branch on, OPC is the branch opcode to use (e.g.
/// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
/// an optional input flag argument.
COND_BRANCH,
/// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
/// loops.
BDNZ,
BDZ,
/// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
/// towards zero. Used only as part of the long double-to-int
/// conversion sequence.
FADDRTZ,
/// F8RC = MFFS - This moves the FPSCR (not modeled) into the register.
MFFS,
/// TC_RETURN - A tail call return.
/// operand #0 chain
/// operand #1 callee (register or absolute)
/// operand #2 stack adjustment
/// operand #3 optional in flag
TC_RETURN,
/// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
CR6SET,
CR6UNSET,
/// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
/// for non-position independent code on PPC32.
PPC32_GOT,
/// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
/// local dynamic TLS and position independent code on PPC32.
PPC32_PICGOT,
/// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec
/// TLS model, produces an ADDIS8 instruction that adds the GOT
/// base to sym\@got\@tprel\@ha.
ADDIS_GOT_TPREL_HA,
/// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec
/// TLS model, produces a LD instruction with base register G8RReg
/// and offset sym\@got\@tprel\@l. This completes the addition that
/// finds the offset of "sym" relative to the thread pointer.
LD_GOT_TPREL_L,
/// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS
/// model, produces an ADD instruction that adds the contents of
/// G8RReg to the thread pointer. Symbol contains a relocation
/// sym\@tls which is to be replaced by the thread pointer and
/// identifies to the linker that the instruction is part of a
/// TLS sequence.
ADD_TLS,
/// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS
/// model, produces an ADDIS8 instruction that adds the GOT base
/// register to sym\@got\@tlsgd\@ha.
ADDIS_TLSGD_HA,
/// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
/// model, produces an ADDI8 instruction that adds G8RReg to
/// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by
/// ADDIS_TLSGD_L_ADDR until after register assignment.
ADDI_TLSGD_L,
/// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS
/// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by
/// ADDIS_TLSGD_L_ADDR until after register assignment.
GET_TLS_ADDR,
/// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that
/// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following
/// register assignment.
ADDI_TLSGD_L_ADDR,
/// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS
/// model, produces an ADDIS8 instruction that adds the GOT base
/// register to sym\@got\@tlsld\@ha.
ADDIS_TLSLD_HA,
/// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
/// model, produces an ADDI8 instruction that adds G8RReg to
/// sym\@got\@tlsld\@l and stores the result in X3. Hidden by
/// ADDIS_TLSLD_L_ADDR until after register assignment.
ADDI_TLSLD_L,
/// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS
/// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by
/// ADDIS_TLSLD_L_ADDR until after register assignment.
GET_TLSLD_ADDR,
/// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that
/// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion
/// following register assignment.
ADDI_TLSLD_L_ADDR,
/// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS
/// model, produces an ADDIS8 instruction that adds X3 to
/// sym\@dtprel\@ha.
ADDIS_DTPREL_HA,
/// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
/// model, produces an ADDI8 instruction that adds G8RReg to
/// sym\@got\@dtprel\@l.
ADDI_DTPREL_L,
/// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
/// during instruction selection to optimize a BUILD_VECTOR into
/// operations on splats. This is necessary to avoid losing these
/// optimizations due to constant folding.
VADD_SPLAT,
/// CHAIN = SC CHAIN, Imm128 - System call. The 7-bit unsigned
/// operand identifies the operating system entry point.
SC,
/// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer.
CLRBHRB,
/// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch
/// history rolling buffer entry.
MFBHRBE,
/// CHAIN = RFEBB CHAIN, State - Return from event-based branch.
RFEBB,
/// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little
/// endian. Maps to an xxswapd instruction that corrects an lxvd2x
/// or stxvd2x instruction. The chain is necessary because the
/// sequence replaces a load and needs to provide the same number
/// of outputs.
XXSWAPD,
/// An SDNode for swaps that are not associated with any loads/stores
/// and thereby have no chain.
SWAP_NO_CHAIN,
/// An SDNode for Power9 vector absolute value difference.
/// operand #0 vector
/// operand #1 vector
/// operand #2 constant i32 0 or 1, to indicate whether needs to patch
/// the most significant bit for signed i32
///
/// Power9 VABSD* instructions are designed to support unsigned integer
/// vectors (byte/halfword/word); if we want to use them for signed
/// integer vectors, we have to flip their sign bits first. Flipping the
/// sign bit of a byte/halfword integer vector would be inefficient, but
/// for a word integer vector we can leverage XVNEGSP to do it
/// efficiently, e.g.:
/// abs(sub(a,b)) => VABSDUW(a+0x80000000, b+0x80000000)
/// => VABSDUW((XVNEGSP a), (XVNEGSP b))
VABSD,
/// QVFPERM = This corresponds to the QPX qvfperm instruction.
QVFPERM,
/// QVGPCI = This corresponds to the QPX qvgpci instruction.
QVGPCI,
/// QVALIGNI = This corresponds to the QPX qvaligni instruction.
QVALIGNI,
/// QVESPLATI = This corresponds to the QPX qvesplati instruction.
QVESPLATI,
/// QBFLT = Access the underlying QPX floating-point boolean
/// representation.
QBFLT,
/// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or
/// lower (IDX=1) half of v4f32 to v2f64.
FP_EXTEND_HALF,
/// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
/// byte-swapping store instruction. It byte-swaps the low "Type" bits of
/// the GPRC input, then stores it through Ptr. Type can be either i16 or
/// i32.
STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE,
/// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
/// byte-swapping load instruction. It loads "Type" bits, byte swaps it,
/// then puts it in the bottom bits of the GPRC. TYPE can be either i16
/// or i32.
LBRX,
/// STFIWX - The STFIWX instruction. The first operand is an input token
/// chain, then an f64 value to store, then an address to store it to.
STFIWX,
/// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point
/// load which sign-extends from a 32-bit integer value into the
/// destination 64-bit register.
LFIWAX,
/// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point
/// load which zero-extends from a 32-bit integer value into the
/// destination 64-bit register.
LFIWZX,
/// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an
/// integer smaller than 64 bits into a VSR. The integer is zero-extended.
/// This can be used for converting loaded integers to floating point.
LXSIZX,
/// STXSIX - The STXSI[bh]X instruction. The first operand is an input
/// chain, then an f64 value to store, then an address to store it to,
/// followed by a byte-width for the store.
STXSIX,
/// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
/// Maps directly to an lxvd2x instruction that will be followed by
/// an xxswapd.
LXVD2X,
/// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian.
/// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on
/// the vector type to load vector in big-endian element order.
LOAD_VEC_BE,
/// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a
/// v2f32 value into the lower half of a VSR register.
LD_VSX_LH,
/// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory
/// instruction such as LXVDSX or LXVWSX.
LD_SPLAT,
/// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
/// Maps directly to an stxvd2x instruction that will be preceded by
/// an xxswapd.
STXVD2X,
/// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian.
/// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on
/// the vector type to store vector in big-endian element order.
STORE_VEC_BE,
/// Store scalar integers from VSR.
ST_VSR_SCAL_INT,
/// QBRC, CHAIN = QVLFSb CHAIN, Ptr
/// The 4xf32 load used for v4i1 constants.
QVLFSb,
/// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes
/// except they ensure that the compare input is zero-extended for
/// sub-word versions because the atomic loads zero-extend.
ATOMIC_CMP_SWAP_8,
ATOMIC_CMP_SWAP_16,
/// GPRC = TOC_ENTRY GA, TOC
/// Loads the entry for GA from the TOC, where the TOC base is given by
/// the last operand.
TOC_ENTRY
};
} // end namespace PPCISD
/// Define some predicates that are used for node matching.
namespace PPC {
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG);
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUWUM instruction.
bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG);
/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUDUM instruction.
bool isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG);
/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
/// a VRGL* instruction with the specified unit size (1,2 or 4 bytes).
bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned ShuffleKind, SelectionDAG &DAG);
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
/// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned ShuffleKind, SelectionDAG &DAG);
/// isVMRGEOShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGEW or VMRGOW instruction
bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
unsigned ShuffleKind, SelectionDAG &DAG);
/// isXXSLDWIShuffleMask - Return true if this is a shuffle mask suitable
/// for a XXSLDWI instruction.
bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
bool &Swap, bool IsLE);
/// isXXBRHShuffleMask - Return true if this is a shuffle mask suitable
/// for a XXBRH instruction.
bool isXXBRHShuffleMask(ShuffleVectorSDNode *N);
/// isXXBRWShuffleMask - Return true if this is a shuffle mask suitable
/// for a XXBRW instruction.
bool isXXBRWShuffleMask(ShuffleVectorSDNode *N);
/// isXXBRDShuffleMask - Return true if this is a shuffle mask suitable
/// for a XXBRD instruction.
bool isXXBRDShuffleMask(ShuffleVectorSDNode *N);
/// isXXBRQShuffleMask - Return true if this is a shuffle mask suitable
/// for a XXBRQ instruction.
bool isXXBRQShuffleMask(ShuffleVectorSDNode *N);
/// isXXPERMDIShuffleMask - Return true if this is a shuffle mask suitable
/// for a XXPERMDI instruction.
bool isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
bool &Swap, bool IsLE);
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
/// shift amount, otherwise return -1.
int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG);
/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a splat of a single element that is suitable for input to
/// VSPLTB/VSPLTH/VSPLTW.
bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);
/// isXXINSERTWMask - Return true if this VECTOR_SHUFFLE can be handled by
/// the XXINSERTW instruction introduced in ISA 3.0. This is essentially any
/// shuffle of v4f32/v4i32 vectors that just inserts one element from one
/// vector into the other. This function will also set a couple of
/// output parameters for how much the source vector needs to be shifted and
/// what byte number needs to be specified for the instruction to put the
/// element in the desired location of the target vector.
bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
unsigned &InsertAtByte, bool &Swap, bool IsLE);
/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
/// appropriate for PPC mnemonics (which have a big endian bias - namely
/// elements are counted from the left of the vector register).
unsigned getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
SelectionDAG &DAG);
/// get_VSPLTI_elt - If this is a build_vector of constants which can be
/// formed by using a vspltis[bhw] instruction of the specified element
/// size, return the constant being splatted. The ByteSize field indicates
/// the number of bytes of each element [124] -> [bhw].
SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
/// If this is a qvaligni shuffle mask, return the shift
/// amount, otherwise return -1.
int isQVALIGNIShuffleMask(SDNode *N);
} // end namespace PPC
class PPCTargetLowering : public TargetLowering {
const PPCSubtarget &Subtarget;
public:
explicit PPCTargetLowering(const PPCTargetMachine &TM,
const PPCSubtarget &STI);
/// getTargetNodeName() - This method returns the name of a target specific
/// DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
bool isSelectSupported(SelectSupportKind Kind) const override {
// PowerPC does not support scalar condition selects on vectors.
return (Kind != SelectSupportKind::ScalarCondVectorVal);
}
/// getPreferredVectorAction - The code we generate when vector types are
/// legalized by promoting the integer element type is often much worse
/// than code we generate if we widen the type for applicable vector types.
/// The issue with promoting is that the vector is scalarized, individual
/// elements promoted and then the vector is rebuilt. So say we load a pair
/// of v4i8's and shuffle them. This will turn into a mess of 8 extending
/// loads, moves back into VSR's (or memory ops if we don't have moves) and
/// then the VPERM for the shuffle. All in all a very slow sequence.
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
const override {
- if (VT.getScalarSizeInBits() % 8 == 0)
+ if (VT.getVectorNumElements() != 1 && VT.getScalarSizeInBits() % 8 == 0)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
bool useSoftFloat() const override;
bool hasSPE() const;
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
return MVT::i32;
}
bool isCheapToSpeculateCttz() const override {
return true;
}
bool isCheapToSpeculateCtlz() const override {
return true;
}
bool isCtlzFast() const override {
return true;
}
bool isEqualityCmpFoldedWithSignedCmp() const override {
return false;
}
bool hasAndNotCompare(SDValue) const override {
return true;
}
bool preferIncOfAddToSubOfNot(EVT VT) const override;
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
return VT.isScalarInteger();
}
bool supportSplitCSR(MachineFunction *MF) const override {
return
MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
}
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
/// getSetCCResultType - Return the ISD::SETCC ValueType
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
/// Return true if the target always benefits from combining into FMA for a
/// given value type. This must typically return false on targets where FMA
/// takes more cycles to execute than FADD.
bool enableAggressiveFMAFusion(EVT VT) const override;
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const override;
/// SelectAddressEVXRegReg - Given the specified address, check to see if
/// it can be more efficiently represented as [r+imm].
bool SelectAddressEVXRegReg(SDValue N, SDValue &Base, SDValue &Index,
SelectionDAG &DAG) const;
/// SelectAddressRegReg - Given the specified address, check to see if it
/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment
/// is non-zero, only accept displacement which is not suitable for [r+imm].
/// Returns false if it can be represented by [r+imm], which are preferred.
bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
SelectionDAG &DAG,
unsigned EncodingAlignment = 0) const;
/// SelectAddressRegImm - Returns true if the address N can be represented
/// by a base register plus a signed 16-bit displacement [r+imm], and if it
/// is not better represented as reg+reg. If \p EncodingAlignment is
/// non-zero, only accept displacements suitable for instruction encoding
/// requirement, i.e. multiples of 4 for DS form.
bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
SelectionDAG &DAG,
unsigned EncodingAlignment) const;
/// SelectAddressRegRegOnly - Given the specified address, force it to be
/// represented as an indexed [r+r] operation.
bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
SelectionDAG &DAG) const;
Sched::Preference getSchedulingPreference(SDNode *N) const override;
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const override;
SDValue expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
void computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
Align getPrefLoopAlignment(MachineLoop *ML) const override;
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
return true;
}
Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
MachineBasicBlock *EmitAtomicBinary(MachineInstr &MI,
MachineBasicBlock *MBB,
unsigned AtomicSize,
unsigned BinOpcode,
unsigned CmpOpcode = 0,
unsigned CmpPred = 0) const;
MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr &MI,
MachineBasicBlock *MBB,
bool is8bit,
unsigned Opcode,
unsigned CmpOpcode = 0,
unsigned CmpPred = 0) const;
MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
ConstraintWeight getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. This is the actual
/// alignment, not its logarithm.
unsigned getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const override;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
unsigned
getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "es")
return InlineAsm::Constraint_es;
else if (ConstraintCode == "o")
return InlineAsm::Constraint_o;
else if (ConstraintCode == "Q")
return InlineAsm::Constraint_Q;
else if (ConstraintCode == "Z")
return InlineAsm::Constraint_Z;
else if (ConstraintCode == "Zy")
return InlineAsm::Constraint_Zy;
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
Type *Ty, unsigned AS,
Instruction *I = nullptr) const override;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
/// compare a register against the immediate without having to materialize
/// the immediate into a register.
bool isLegalICmpImmediate(int64_t Imm) const override;
/// isLegalAddImmediate - Return true if the specified immediate is legal
/// add immediate, that is the target has add instructions which can
/// add a register and the immediate without having to materialize
/// the immediate into a register.
bool isLegalAddImmediate(int64_t Imm) const override;
/// isTruncateFree - Return true if it's free to truncate a value of
/// type Ty1 to type Ty2. E.g., on PPC it's free to truncate an i64 value in
/// register X1 to i32 by referencing its sub-register R1.
bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
bool isTruncateFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isFPExtFree(EVT DestVT, EVT SrcVT) const override;
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
bool convertSelectOfConstantsToMath(EVT VT) const override {
return true;
}
bool isDesirableToTransformToIntegerOp(unsigned Opc,
EVT VT) const override {
// Only handle float load/store pairs, because float (FPR) load/store
// instructions take more cycles than integer (GPR) load/store on PPC.
if (Opc != ISD::LOAD && Opc != ISD::STORE)
return false;
if (VT != MVT::f32 && VT != MVT::f64)
return false;
return true;
}
// Returns true if the address of the global is stored in a TOC entry.
bool isAccessedAsGotIndirect(SDValue N) const;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const override;
/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero, there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, the call is expanding a memset.
/// If 'ZeroMemset' is true, it is a memset of zero. 'MemcpyStrSrc' indicates
/// whether the memcpy source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const override;
/// Is unaligned memory access allowed for the given type, and is it fast
/// relative to software emulation.
bool allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, unsigned Align = 1,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *Fast = nullptr) const override;
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
/// expanded to fmul + fadd.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
// Should we expand the build vector with shuffles?
bool
shouldExpandBuildVectorWithShuffles(EVT VT,
unsigned DefinedValues) const override;
/// createFastISel - This method returns a target-specific FastISel object,
/// or null if the target does not support "fast" instruction selection.
FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) const override;
/// Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
// We support any array type as "consecutive" block in the parameter
// save area. The element type defines the alignment requirement and
// whether the argument should go in GPRs, FPRs, or VRs if available.
//
// Note that clang uses this capability both to implement the ELFv2
// homogeneous float/vector aggregate ABI, and to avoid having to use
// "byval" when passing aggregates that might fully fit in registers.
return Ty->isArrayTy();
}
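// For example, under the ELFv2 ABI clang lowers a homogeneous aggregate
// such as
//   struct V { float f[4]; };
// to a [4 x float] argument, which this hook keeps in four consecutive
// FPRs (or in one contiguous block of the parameter save area).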
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
unsigned
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
unsigned
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
/// Override to support customized stack guard loading.
bool useLoadStackGuardNode() const override;
void insertSSPDeclarations(Module &M) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
unsigned getJumpTableEncoding() const override;
bool isJumpTableRelative() const override;
SDValue getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const override;
const MCExpr *getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
unsigned JTI,
MCContext &Ctx) const override;
private:
struct ReuseLoadInfo {
SDValue Ptr;
SDValue Chain;
SDValue ResChain;
MachinePointerInfo MPI;
bool IsDereferenceable = false;
bool IsInvariant = false;
unsigned Alignment = 0;
AAMDNodes AAInfo;
const MDNode *Ranges = nullptr;
ReuseLoadInfo() = default;
MachineMemOperand::Flags MMOFlags() const {
MachineMemOperand::Flags F = MachineMemOperand::MONone;
if (IsDereferenceable)
F |= MachineMemOperand::MODereferenceable;
if (IsInvariant)
F |= MachineMemOperand::MOInvariant;
return F;
}
};
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
// Addrspacecasts are always noops.
return true;
}
bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI,
SelectionDAG &DAG,
ISD::LoadExtType ET = ISD::NON_EXTLOAD) const;
void spliceIntoChain(SDValue ResChain, SDValue NewResChain,
SelectionDAG &DAG) const;
void LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
SelectionDAG &DAG, const SDLoc &dl) const;
SDValue LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const;
bool directMoveIsProfitable(const SDValue &Op) const;
SDValue LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const;
SDValue LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const;
SDValue LowerTRUNCATEVector(SDValue Op, SelectionDAG &DAG) const;
SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
bool
IsEligibleForTailCallOptimization(SDValue Callee,
CallingConv::ID CalleeCC,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
bool
IsEligibleForTailCallOptimization_64SVR4(
SDValue Callee,
CallingConv::ID CalleeCC,
ImmutableCallSite CS,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG &DAG, int SPDiff,
SDValue Chain, SDValue &LROpOut,
SDValue &FPOpOut,
const SDLoc &dl) const;
SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, SDValue GA) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue FinishCall(CallingConv::ID CallConv, const SDLoc &dl,
bool isTailCall, bool isVarArg, bool isPatchPoint,
bool hasNest, SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
SDValue InFlag, SDValue Chain, SDValue CallSeqStart,
SDValue &Callee, int SPDiff, unsigned NumBytes,
const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite CS) const;
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const override;
SDValue extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
SelectionDAG &DAG, SDValue ArgVal,
const SDLoc &dl) const;
SDValue LowerFormalArguments_AIX(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerFormalArguments_Darwin(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerFormalArguments_64SVR4(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerFormalArguments_32SVR4(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
SDValue createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
SDValue CallSeqStart,
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
const SDLoc &dl) const;
SDValue LowerCall_Darwin(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, bool isPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite CS) const;
SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, bool isPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite CS) const;
SDValue LowerCall_32SVR4(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, bool isPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite CS) const;
SDValue LowerCall_AIX(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, bool isPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite CS) const;
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase,
DAGCombinerInfo &DCI) const;
/// ConvertSETCCToSubtract - looks at SETCC that compares integers. It
/// replaces the SETCC with integer subtraction when (1) there is a legal
/// way of doing it and (2) keeping the result of the comparison in a GPR
/// has a performance benefit.
SDValue ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const;
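// For instance, one such GPR sequence for an i32 equality is:
//   subf   r3, r4, r3     // zero iff the inputs were equal
//   cntlzw r3, r3         // 32 iff zero, otherwise < 32
//   srwi   r3, r3, 5      // 1 iff equal, 0 otherwise
// which avoids the CR-field read (mfcr) a condition-register SETCC needs.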
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps, bool &UseOneConstNR,
bool Reciprocal) const override;
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps) const override;
unsigned combineRepeatedFPDivisors() const override;
SDValue
combineElementTruncationToVectorTruncation(SDNode *N,
DAGCombinerInfo &DCI) const;
/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be
/// handled by the VINSERTH instruction introduced in ISA 3.0. This is
/// essentially any shuffle of v8i16 vectors that just inserts one element
/// from one vector into the other.
SDValue lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be
/// handled by the VINSERTB instruction introduced in ISA 3.0. This is
/// essentially the v16i8 version of VINSERTH.
SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
// Return whether the call instruction can potentially be optimized to a
// tail call. This will cause the optimizers to attempt to move or
// duplicate return instructions to help enable tail call optimizations.
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
bool hasBitPreservingFPLogic(EVT VT) const override;
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
}; // end class PPCTargetLowering
namespace PPC {
FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo);
} // end namespace PPC
bool isIntS16Immediate(SDNode *N, int16_t &Imm);
bool isIntS16Immediate(SDValue Op, int16_t &Imm);
} // end namespace llvm
#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
Index: head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td (revision 362609)
@@ -1,1587 +1,1591 @@
//===-- PPCInstrAltivec.td - The PowerPC Altivec Extension -*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the Altivec extension to the PowerPC instruction set.
//
//===----------------------------------------------------------------------===//
// *********************************** NOTE ***********************************
// ** For POWER8 Little Endian, the VSX swap optimization relies on knowing **
// ** which VMX and VSX instructions are lane-sensitive and which are not. **
// ** A lane-sensitive instruction relies, implicitly or explicitly, on **
// ** whether lanes are numbered from left to right. An instruction like **
// ** VADDFP is not lane-sensitive, because each lane of the result vector **
// ** relies only on the corresponding lane of the source vectors. However, **
// ** an instruction like VMULESB is lane-sensitive, because "even" and **
// ** "odd" lanes are different for big-endian and little-endian numbering. **
// ** **
// ** When adding new VMX and VSX instructions, please consider whether they **
// ** are lane-sensitive. If so, they must be added to a switch statement **
// ** in PPCVSXSwapRemoval::gatherVectorInstructions(). **
// ****************************************************************************
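// As a concrete example: vmulesb multiplies the "even" byte lanes of its
// inputs. Big-endian numbering counts lanes from the leftmost byte and
// little-endian numbering from the rightmost, so the even lanes name
// different data in the two modes, making the instruction lane-sensitive.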
//===----------------------------------------------------------------------===//
// Altivec transformation functions and pattern fragments.
//
// Since we canonicalize buildvectors to v16i8, the "-1" operands of all
// vnots will be of that type.
def vnot_ppc : PatFrag<(ops node:$in),
(xor node:$in, (bitconvert (v16i8 immAllOnesV)))>;
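// e.g. the VANDC pattern below, (and v4i32:$vA, (vnot_ppc v4i32:$vB)),
// matches a DAG of the form (and A, (xor B, all-ones)).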
def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
}]>;
def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
}]>;
def vpkudum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
}]>;
def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
}]>;
def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
}]>;
def vpkudum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
}]>;
// These fragments are provided for little-endian, where the inputs must be
// swapped for correct semantics.
def vpkuhum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
}]>;
def vpkuwum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
}]>;
def vpkudum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
}]>;
def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG);
}]>;
def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG);
}]>;
def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG);
}]>;
def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG);
}]>;
def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG);
}]>;
def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG);
}]>;
def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG);
}]>;
def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG);
}]>;
def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG);
}]>;
def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG);
}]>;
def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG);
}]>;
def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG);
}]>;
// These fragments are provided for little-endian, where the inputs must be
// swapped for correct semantics.
def vmrglb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG);
}]>;
def vmrglh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG);
}]>;
def vmrglw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG);
}]>;
def vmrghb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG);
}]>;
def vmrghh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG);
}]>;
def vmrghw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG);
}]>;
def vmrgew_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 0, *CurDAG);
}]>;
def vmrgow_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 0, *CurDAG);
}]>;
def vmrgew_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 1, *CurDAG);
}]>;
def vmrgow_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 1, *CurDAG);
}]>;
def vmrgew_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 2, *CurDAG);
}]>;
def vmrgow_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 2, *CurDAG);
}]>;
def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{
return getI32Imm(PPC::isVSLDOIShuffleMask(N, 0, *CurDAG), SDLoc(N));
}]>;
def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVSLDOIShuffleMask(N, 0, *CurDAG) != -1;
}], VSLDOI_get_imm>;
/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into
/// vector_shuffle(X,undef,mask) by the DAG combiner.
def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{
return getI32Imm(PPC::isVSLDOIShuffleMask(N, 1, *CurDAG), SDLoc(N));
}]>;
def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVSLDOIShuffleMask(N, 1, *CurDAG) != -1;
}], VSLDOI_unary_get_imm>;
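// For example (big-endian lane numbering), the unary fragment with SH = 1
// matches the v16i8 mask <1,2,...,15,0> (bytes 1..16 of X:X), which is
// exactly vsldoi(X, X, 1).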
/// VSLDOI_swapped* - These fragments are provided for little-endian, where
/// the inputs must be swapped for correct semantics.
def VSLDOI_swapped_get_imm : SDNodeXForm<vector_shuffle, [{
return getI32Imm(PPC::isVSLDOIShuffleMask(N, 2, *CurDAG), SDLoc(N));
}]>;
def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isVSLDOIShuffleMask(N, 2, *CurDAG) != -1;
}], VSLDOI_get_imm>;
// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 1, *CurDAG), SDLoc(N));
}]>;
def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
}], VSPLTB_get_imm>;
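// For example, a v16i8 vector_shuffle whose mask is all 5s is a splat of
// byte lane 5: vspltb_shuffle accepts it and VSPLTB_get_imm converts the
// mask into the immediate (adjusting the lane index for little-endian),
// selecting "vspltb $vD, $vB, 5" on big-endian targets.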
def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 2, *CurDAG), SDLoc(N));
}]>;
def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
}], VSPLTH_get_imm>;
def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 4, *CurDAG), SDLoc(N));
}]>;
def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 4);
}], VSPLTW_get_imm>;
// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm.
def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{
return PPC::get_VSPLTI_elt(N, 1, *CurDAG);
}]>;
def vecspltisb : PatLeaf<(build_vector), [{
return PPC::get_VSPLTI_elt(N, 1, *CurDAG).getNode() != nullptr;
}], VSPLTISB_get_imm>;
// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm.
def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{
return PPC::get_VSPLTI_elt(N, 2, *CurDAG);
}]>;
def vecspltish : PatLeaf<(build_vector), [{
return PPC::get_VSPLTI_elt(N, 2, *CurDAG).getNode() != nullptr;
}], VSPLTISH_get_imm>;
// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm.
def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{
return PPC::get_VSPLTI_elt(N, 4, *CurDAG);
}]>;
def vecspltisw : PatLeaf<(build_vector), [{
return PPC::get_VSPLTI_elt(N, 4, *CurDAG).getNode() != nullptr;
}], VSPLTISW_get_imm>;
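// These match only build_vectors whose splat value fits the sign-extended
// 5-bit SIMM field, e.g. (v4i32 (build_vector 3, 3, 3, 3)) selects
// "vspltisw $vD, 3"; splats outside [-16, 15] fall back to other lowering.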
def immEQOneV : PatLeaf<(build_vector), [{
if (ConstantSDNode *C = cast<BuildVectorSDNode>(N)->getConstantSplatNode())
return C->isOne();
return false;
}]>;
//===----------------------------------------------------------------------===//
// Helpers for defining instructions that directly correspond to intrinsics.
// VA1a_Int_Ty - A VAForm_1a intrinsic definition of a specific type.
class VA1a_Int_Ty<bits<6> xo, string opc, Intrinsic IntID, ValueType Ty>
: VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
!strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
[(set Ty:$vD, (IntID Ty:$vA, Ty:$vB, Ty:$vC))]>;
// VA1a_Int_Ty2 - A VAForm_1a intrinsic definition where the type of the
// inputs doesn't match the type of the output.
class VA1a_Int_Ty2<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
ValueType InTy>
: VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
!strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
[(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB, InTy:$vC))]>;
// VA1a_Int_Ty3 - A VAForm_1a intrinsic definition where there are two
// input types and an output type.
class VA1a_Int_Ty3<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
ValueType In1Ty, ValueType In2Ty>
: VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
!strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
[(set OutTy:$vD,
(IntID In1Ty:$vA, In1Ty:$vB, In2Ty:$vC))]>;
// VX1_Int_Ty - A VXForm_1 intrinsic definition of a specific type.
class VX1_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
: VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
!strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
[(set Ty:$vD, (IntID Ty:$vA, Ty:$vB))]>;
// VX1_Int_Ty2 - A VXForm_1 intrinsic definition where the type of the
// inputs doesn't match the type of the output.
class VX1_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
ValueType InTy>
: VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
!strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
[(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB))]>;
// VX1_Int_Ty3 - A VXForm_1 intrinsic definition where there are two
// input types and an output type.
class VX1_Int_Ty3<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
ValueType In1Ty, ValueType In2Ty>
: VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
!strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
[(set OutTy:$vD, (IntID In1Ty:$vA, In2Ty:$vB))]>;
// VX2_Int_SP - A VXForm_2 intrinsic definition of vector single-precision type.
class VX2_Int_SP<bits<11> xo, string opc, Intrinsic IntID>
: VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB),
!strconcat(opc, " $vD, $vB"), IIC_VecFP,
[(set v4f32:$vD, (IntID v4f32:$vB))]>;
// VX2_Int_Ty2 - A VXForm_2 intrinsic definition where the type of the
// inputs doesn't match the type of the output.
class VX2_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
ValueType InTy>
: VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB),
!strconcat(opc, " $vD, $vB"), IIC_VecFP,
[(set OutTy:$vD, (IntID InTy:$vB))]>;
class VXBX_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
: VXForm_BX<xo, (outs vrrc:$vD), (ins vrrc:$vA),
!strconcat(opc, " $vD, $vA"), IIC_VecFP,
[(set Ty:$vD, (IntID Ty:$vA))]>;
class VXCR_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
: VXForm_CR<xo, (outs vrrc:$vD), (ins vrrc:$vA, u1imm:$ST, u4imm:$SIX),
!strconcat(opc, " $vD, $vA, $ST, $SIX"), IIC_VecFP,
[(set Ty:$vD, (IntID Ty:$vA, timm:$ST, timm:$SIX))]>;
//===----------------------------------------------------------------------===//
// Instruction Definitions.
def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">;
let Predicates = [HasAltivec] in {
def DSS : DSS_Form<0, 822, (outs), (ins u5imm:$STRM),
"dss $STRM", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dss imm:$STRM)]>,
Deprecated<DeprecatedDST> {
let A = 0;
let B = 0;
}
def DSSALL : DSS_Form<1, 822, (outs), (ins),
"dssall", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dssall)]>,
Deprecated<DeprecatedDST> {
let STRM = 0;
let A = 0;
let B = 0;
}
def DST : DSS_Form<0, 342, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
"dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
[(int_ppc_altivec_dst i32:$rA, i32:$rB, imm:$STRM)]>,
Deprecated<DeprecatedDST>;
def DSTT : DSS_Form<1, 342, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
"dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
[(int_ppc_altivec_dstt i32:$rA, i32:$rB, imm:$STRM)]>,
Deprecated<DeprecatedDST>;
def DSTST : DSS_Form<0, 374, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
"dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
[(int_ppc_altivec_dstst i32:$rA, i32:$rB, imm:$STRM)]>,
Deprecated<DeprecatedDST>;
def DSTSTT : DSS_Form<1, 374, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
"dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
[(int_ppc_altivec_dststt i32:$rA, i32:$rB, imm:$STRM)]>,
Deprecated<DeprecatedDST>;
let isCodeGenOnly = 1 in {
// The very same instructions as above, but formally matching 64-bit registers.
def DST64 : DSS_Form<0, 342, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
"dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
[(int_ppc_altivec_dst i64:$rA, i32:$rB, imm:$STRM)]>,
Deprecated<DeprecatedDST>;
def DSTT64 : DSS_Form<1, 342, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
"dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
[(int_ppc_altivec_dstt i64:$rA, i32:$rB, imm:$STRM)]>,
Deprecated<DeprecatedDST>;
def DSTST64 : DSS_Form<0, 374, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
"dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
[(int_ppc_altivec_dstst i64:$rA, i32:$rB,
imm:$STRM)]>,
Deprecated<DeprecatedDST>;
def DSTSTT64 : DSS_Form<1, 374, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
"dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
[(int_ppc_altivec_dststt i64:$rA, i32:$rB,
imm:$STRM)]>,
Deprecated<DeprecatedDST>;
}
def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
"mfvscr $vD", IIC_LdStStore,
[(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>;
def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
"mtvscr $vB", IIC_LdStLoad,
[(int_ppc_altivec_mtvscr v4i32:$vB)]>;
let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads.
def LVEBX: XForm_1_memOp<31, 7, (outs vrrc:$vD), (ins memrr:$src),
"lvebx $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
def LVEHX: XForm_1_memOp<31, 39, (outs vrrc:$vD), (ins memrr:$src),
"lvehx $vD, $src", IIC_LdStLoad,
[(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
def LVEWX: XForm_1_memOp<31, 71, (outs vrrc:$vD), (ins memrr:$src),
"lvewx $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
def LVX : XForm_1_memOp<31, 103, (outs vrrc:$vD), (ins memrr:$src),
"lvx $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
def LVXL : XForm_1_memOp<31, 359, (outs vrrc:$vD), (ins memrr:$src),
"lvxl $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
}
def LVSL : XForm_1_memOp<31, 6, (outs vrrc:$vD), (ins memrr:$src),
"lvsl $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
PPC970_Unit_LSU;
def LVSR : XForm_1_memOp<31, 38, (outs vrrc:$vD), (ins memrr:$src),
"lvsr $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
PPC970_Unit_LSU;
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { // Stores.
def STVEBX: XForm_8_memOp<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
"stvebx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
def STVEHX: XForm_8_memOp<31, 167, (outs), (ins vrrc:$rS, memrr:$dst),
"stvehx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>;
def STVEWX: XForm_8_memOp<31, 199, (outs), (ins vrrc:$rS, memrr:$dst),
"stvewx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>;
def STVX : XForm_8_memOp<31, 231, (outs), (ins vrrc:$rS, memrr:$dst),
"stvx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>;
def STVXL : XForm_8_memOp<31, 487, (outs), (ins vrrc:$rS, memrr:$dst),
"stvxl $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>;
}
let PPC970_Unit = 5 in { // VALU Operations.
// VA-Form instructions. 3-input AltiVec ops.
let isCommutable = 1 in {
def VMADDFP : VAForm_1<46, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
"vmaddfp $vD, $vA, $vC, $vB", IIC_VecFP,
[(set v4f32:$vD,
(fma v4f32:$vA, v4f32:$vC, v4f32:$vB))]>;
// FIXME: The fma+fneg pattern won't match because fneg is not legal.
def VNMSUBFP: VAForm_1<47, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
"vnmsubfp $vD, $vA, $vC, $vB", IIC_VecFP,
[(set v4f32:$vD, (fneg (fma v4f32:$vA, v4f32:$vC,
(fneg v4f32:$vB))))]>;
def VMHADDSHS : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>;
def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs,
v8i16>;
def VMLADDUHM : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>;
} // isCommutable
def VPERM : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm,
v4i32, v4i32, v16i8>;
def VSEL : VA1a_Int_Ty<42, "vsel", int_ppc_altivec_vsel, v4i32>;
// Shuffles.
def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u4imm:$SH),
"vsldoi $vD, $vA, $vB, $SH", IIC_VecFP,
[(set v16i8:$vD,
(PPCvecshl v16i8:$vA, v16i8:$vB, imm32SExt16:$SH))]>;
// VX-Form instructions. AltiVec arithmetic ops.
let isCommutable = 1 in {
def VADDFP : VXForm_1<10, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vaddfp $vD, $vA, $vB", IIC_VecFP,
[(set v4f32:$vD, (fadd v4f32:$vA, v4f32:$vB))]>;
def VADDUBM : VXForm_1<0, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vaddubm $vD, $vA, $vB", IIC_VecGeneral,
[(set v16i8:$vD, (add v16i8:$vA, v16i8:$vB))]>;
def VADDUHM : VXForm_1<64, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vadduhm $vD, $vA, $vB", IIC_VecGeneral,
[(set v8i16:$vD, (add v8i16:$vA, v8i16:$vB))]>;
def VADDUWM : VXForm_1<128, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vadduwm $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (add v4i32:$vA, v4i32:$vB))]>;
def VADDCUW : VX1_Int_Ty<384, "vaddcuw", int_ppc_altivec_vaddcuw, v4i32>;
def VADDSBS : VX1_Int_Ty<768, "vaddsbs", int_ppc_altivec_vaddsbs, v16i8>;
def VADDSHS : VX1_Int_Ty<832, "vaddshs", int_ppc_altivec_vaddshs, v8i16>;
def VADDSWS : VX1_Int_Ty<896, "vaddsws", int_ppc_altivec_vaddsws, v4i32>;
def VADDUBS : VX1_Int_Ty<512, "vaddubs", int_ppc_altivec_vaddubs, v16i8>;
def VADDUHS : VX1_Int_Ty<576, "vadduhs", int_ppc_altivec_vadduhs, v8i16>;
def VADDUWS : VX1_Int_Ty<640, "vadduws", int_ppc_altivec_vadduws, v4i32>;
} // isCommutable
let isCommutable = 1 in
def VAND : VXForm_1<1028, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vand $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (and v4i32:$vA, v4i32:$vB))]>;
def VANDC : VXForm_1<1092, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vandc $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (and v4i32:$vA,
(vnot_ppc v4i32:$vB)))]>;
def VCFSX : VXForm_1<842, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
"vcfsx $vD, $vB, $UIMM", IIC_VecFP,
[(set v4f32:$vD,
(int_ppc_altivec_vcfsx v4i32:$vB, timm:$UIMM))]>;
def VCFUX : VXForm_1<778, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
"vcfux $vD, $vB, $UIMM", IIC_VecFP,
[(set v4f32:$vD,
(int_ppc_altivec_vcfux v4i32:$vB, timm:$UIMM))]>;
def VCTSXS : VXForm_1<970, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
"vctsxs $vD, $vB, $UIMM", IIC_VecFP,
[(set v4i32:$vD,
(int_ppc_altivec_vctsxs v4f32:$vB, timm:$UIMM))]>;
def VCTUXS : VXForm_1<906, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
"vctuxs $vD, $vB, $UIMM", IIC_VecFP,
[(set v4i32:$vD,
(int_ppc_altivec_vctuxs v4f32:$vB, timm:$UIMM))]>;
// Definitions with the UIM field set to 0 for floating-point
// to integer (fp_to_sint/fp_to_uint) conversions and integer
// to floating-point (sint_to_fp/uint_to_fp) conversions
// (see the note after this block).
let isCodeGenOnly = 1, VA = 0 in {
def VCFSX_0 : VXForm_1<842, (outs vrrc:$vD), (ins vrrc:$vB),
"vcfsx $vD, $vB, 0", IIC_VecFP,
[(set v4f32:$vD,
(int_ppc_altivec_vcfsx v4i32:$vB, 0))]>;
def VCTUXS_0 : VXForm_1<906, (outs vrrc:$vD), (ins vrrc:$vB),
"vctuxs $vD, $vB, 0", IIC_VecFP,
[(set v4i32:$vD,
(int_ppc_altivec_vctuxs v4f32:$vB, 0))]>;
def VCFUX_0 : VXForm_1<778, (outs vrrc:$vD), (ins vrrc:$vB),
"vcfux $vD, $vB, 0", IIC_VecFP,
[(set v4f32:$vD,
(int_ppc_altivec_vcfux v4i32:$vB, 0))]>;
def VCTSXS_0 : VXForm_1<970, (outs vrrc:$vD), (ins vrrc:$vB),
"vctsxs $vD, $vB, 0", IIC_VecFP,
[(set v4i32:$vD,
(int_ppc_altivec_vctsxs v4f32:$vB, 0))]>;
}
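// Note: the UIM operand of vcfsx/vcfux/vctsxs/vctuxs is a power-of-two
// scale (the value is multiplied or divided by 2**UIM around the
// conversion); plain conversions need no scaling, hence the hard-coded 0
// above. The fp_to_sint/uint and sint/uint_to_fp patterns near the end of
// this file select these variants.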
def VEXPTEFP : VX2_Int_SP<394, "vexptefp", int_ppc_altivec_vexptefp>;
def VLOGEFP : VX2_Int_SP<458, "vlogefp", int_ppc_altivec_vlogefp>;
let isCommutable = 1 in {
def VAVGSB : VX1_Int_Ty<1282, "vavgsb", int_ppc_altivec_vavgsb, v16i8>;
def VAVGSH : VX1_Int_Ty<1346, "vavgsh", int_ppc_altivec_vavgsh, v8i16>;
def VAVGSW : VX1_Int_Ty<1410, "vavgsw", int_ppc_altivec_vavgsw, v4i32>;
def VAVGUB : VX1_Int_Ty<1026, "vavgub", int_ppc_altivec_vavgub, v16i8>;
def VAVGUH : VX1_Int_Ty<1090, "vavguh", int_ppc_altivec_vavguh, v8i16>;
def VAVGUW : VX1_Int_Ty<1154, "vavguw", int_ppc_altivec_vavguw, v4i32>;
def VMAXFP : VX1_Int_Ty<1034, "vmaxfp", int_ppc_altivec_vmaxfp, v4f32>;
def VMAXSB : VX1_Int_Ty< 258, "vmaxsb", int_ppc_altivec_vmaxsb, v16i8>;
def VMAXSH : VX1_Int_Ty< 322, "vmaxsh", int_ppc_altivec_vmaxsh, v8i16>;
def VMAXSW : VX1_Int_Ty< 386, "vmaxsw", int_ppc_altivec_vmaxsw, v4i32>;
def VMAXUB : VX1_Int_Ty< 2, "vmaxub", int_ppc_altivec_vmaxub, v16i8>;
def VMAXUH : VX1_Int_Ty< 66, "vmaxuh", int_ppc_altivec_vmaxuh, v8i16>;
def VMAXUW : VX1_Int_Ty< 130, "vmaxuw", int_ppc_altivec_vmaxuw, v4i32>;
def VMINFP : VX1_Int_Ty<1098, "vminfp", int_ppc_altivec_vminfp, v4f32>;
def VMINSB : VX1_Int_Ty< 770, "vminsb", int_ppc_altivec_vminsb, v16i8>;
def VMINSH : VX1_Int_Ty< 834, "vminsh", int_ppc_altivec_vminsh, v8i16>;
def VMINSW : VX1_Int_Ty< 898, "vminsw", int_ppc_altivec_vminsw, v4i32>;
def VMINUB : VX1_Int_Ty< 514, "vminub", int_ppc_altivec_vminub, v16i8>;
def VMINUH : VX1_Int_Ty< 578, "vminuh", int_ppc_altivec_vminuh, v8i16>;
def VMINUW : VX1_Int_Ty< 642, "vminuw", int_ppc_altivec_vminuw, v4i32>;
} // isCommutable
def VMRGHB : VXForm_1< 12, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrghb $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrghb_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMRGHH : VXForm_1< 76, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrghh $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrghh_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMRGHW : VXForm_1<140, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrghw $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrghw_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMRGLB : VXForm_1<268, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrglb $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrglb_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMRGLH : VXForm_1<332, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrglh $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrglh_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMRGLW : VXForm_1<396, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrglw $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrglw_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm,
v4i32, v16i8, v4i32>;
def VMSUMSHM : VA1a_Int_Ty3<40, "vmsumshm", int_ppc_altivec_vmsumshm,
v4i32, v8i16, v4i32>;
def VMSUMSHS : VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs,
v4i32, v8i16, v4i32>;
def VMSUMUBM : VA1a_Int_Ty3<36, "vmsumubm", int_ppc_altivec_vmsumubm,
v4i32, v16i8, v4i32>;
def VMSUMUHM : VA1a_Int_Ty3<38, "vmsumuhm", int_ppc_altivec_vmsumuhm,
v4i32, v8i16, v4i32>;
def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs,
v4i32, v8i16, v4i32>;
let isCommutable = 1 in {
def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb,
v8i16, v16i8>;
def VMULESH : VX1_Int_Ty2<840, "vmulesh", int_ppc_altivec_vmulesh,
v4i32, v8i16>;
def VMULEUB : VX1_Int_Ty2<520, "vmuleub", int_ppc_altivec_vmuleub,
v8i16, v16i8>;
def VMULEUH : VX1_Int_Ty2<584, "vmuleuh", int_ppc_altivec_vmuleuh,
v4i32, v8i16>;
def VMULOSB : VX1_Int_Ty2<264, "vmulosb", int_ppc_altivec_vmulosb,
v8i16, v16i8>;
def VMULOSH : VX1_Int_Ty2<328, "vmulosh", int_ppc_altivec_vmulosh,
v4i32, v8i16>;
def VMULOUB : VX1_Int_Ty2< 8, "vmuloub", int_ppc_altivec_vmuloub,
v8i16, v16i8>;
def VMULOUH : VX1_Int_Ty2< 72, "vmulouh", int_ppc_altivec_vmulouh,
v4i32, v8i16>;
} // isCommutable
def VREFP : VX2_Int_SP<266, "vrefp", int_ppc_altivec_vrefp>;
def VRFIM : VX2_Int_SP<714, "vrfim", int_ppc_altivec_vrfim>;
def VRFIN : VX2_Int_SP<522, "vrfin", int_ppc_altivec_vrfin>;
def VRFIP : VX2_Int_SP<650, "vrfip", int_ppc_altivec_vrfip>;
def VRFIZ : VX2_Int_SP<586, "vrfiz", int_ppc_altivec_vrfiz>;
def VRSQRTEFP : VX2_Int_SP<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>;
def VSUBCUW : VX1_Int_Ty<1408, "vsubcuw", int_ppc_altivec_vsubcuw, v4i32>;
def VSUBFP : VXForm_1<74, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vsubfp $vD, $vA, $vB", IIC_VecGeneral,
[(set v4f32:$vD, (fsub v4f32:$vA, v4f32:$vB))]>;
def VSUBUBM : VXForm_1<1024, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vsububm $vD, $vA, $vB", IIC_VecGeneral,
[(set v16i8:$vD, (sub v16i8:$vA, v16i8:$vB))]>;
def VSUBUHM : VXForm_1<1088, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vsubuhm $vD, $vA, $vB", IIC_VecGeneral,
[(set v8i16:$vD, (sub v8i16:$vA, v8i16:$vB))]>;
def VSUBUWM : VXForm_1<1152, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vsubuwm $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (sub v4i32:$vA, v4i32:$vB))]>;
def VSUBSBS : VX1_Int_Ty<1792, "vsubsbs" , int_ppc_altivec_vsubsbs, v16i8>;
def VSUBSHS : VX1_Int_Ty<1856, "vsubshs" , int_ppc_altivec_vsubshs, v8i16>;
def VSUBSWS : VX1_Int_Ty<1920, "vsubsws" , int_ppc_altivec_vsubsws, v4i32>;
def VSUBUBS : VX1_Int_Ty<1536, "vsububs" , int_ppc_altivec_vsububs, v16i8>;
def VSUBUHS : VX1_Int_Ty<1600, "vsubuhs" , int_ppc_altivec_vsubuhs, v8i16>;
def VSUBUWS : VX1_Int_Ty<1664, "vsubuws" , int_ppc_altivec_vsubuws, v4i32>;
def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>;
def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>;
def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs,
v4i32, v16i8, v4i32>;
def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs,
v4i32, v8i16, v4i32>;
def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs,
v4i32, v16i8, v4i32>;
def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vnor $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (vnot_ppc (or v4i32:$vA,
v4i32:$vB)))]>;
let isCommutable = 1 in {
def VOR : VXForm_1<1156, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vor $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (or v4i32:$vA, v4i32:$vB))]>;
def VXOR : VXForm_1<1220, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vxor $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (xor v4i32:$vA, v4i32:$vB))]>;
} // isCommutable
def VRLB : VX1_Int_Ty< 4, "vrlb", int_ppc_altivec_vrlb, v16i8>;
def VRLH : VX1_Int_Ty< 68, "vrlh", int_ppc_altivec_vrlh, v8i16>;
def VRLW : VX1_Int_Ty< 132, "vrlw", int_ppc_altivec_vrlw, v4i32>;
def VSL : VX1_Int_Ty< 452, "vsl" , int_ppc_altivec_vsl, v4i32 >;
def VSLO : VX1_Int_Ty<1036, "vslo", int_ppc_altivec_vslo, v4i32>;
def VSLB : VX1_Int_Ty< 260, "vslb", int_ppc_altivec_vslb, v16i8>;
def VSLH : VX1_Int_Ty< 324, "vslh", int_ppc_altivec_vslh, v8i16>;
def VSLW : VX1_Int_Ty< 388, "vslw", int_ppc_altivec_vslw, v4i32>;
def VSPLTB : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
"vspltb $vD, $vB, $UIMM", IIC_VecPerm,
[(set v16i8:$vD,
(vspltb_shuffle:$UIMM v16i8:$vB, (undef)))]>;
def VSPLTH : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
"vsplth $vD, $vB, $UIMM", IIC_VecPerm,
[(set v16i8:$vD,
(vsplth_shuffle:$UIMM v16i8:$vB, (undef)))]>;
def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
"vspltw $vD, $vB, $UIMM", IIC_VecPerm,
[(set v16i8:$vD,
(vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>;
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VSPLTBs : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
"vspltb $vD, $vB, $UIMM", IIC_VecPerm, []>;
def VSPLTHs : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
"vsplth $vD, $vB, $UIMM", IIC_VecPerm, []>;
}
def VSR : VX1_Int_Ty< 708, "vsr" , int_ppc_altivec_vsr, v4i32>;
def VSRO : VX1_Int_Ty<1100, "vsro" , int_ppc_altivec_vsro, v4i32>;
def VSRAB : VX1_Int_Ty< 772, "vsrab", int_ppc_altivec_vsrab, v16i8>;
def VSRAH : VX1_Int_Ty< 836, "vsrah", int_ppc_altivec_vsrah, v8i16>;
def VSRAW : VX1_Int_Ty< 900, "vsraw", int_ppc_altivec_vsraw, v4i32>;
def VSRB : VX1_Int_Ty< 516, "vsrb" , int_ppc_altivec_vsrb , v16i8>;
def VSRH : VX1_Int_Ty< 580, "vsrh" , int_ppc_altivec_vsrh , v8i16>;
def VSRW : VX1_Int_Ty< 644, "vsrw" , int_ppc_altivec_vsrw , v4i32>;
def VSPLTISB : VXForm_3<780, (outs vrrc:$vD), (ins s5imm:$SIMM),
"vspltisb $vD, $SIMM", IIC_VecPerm,
[(set v16i8:$vD, (v16i8 vecspltisb:$SIMM))]>;
def VSPLTISH : VXForm_3<844, (outs vrrc:$vD), (ins s5imm:$SIMM),
"vspltish $vD, $SIMM", IIC_VecPerm,
[(set v8i16:$vD, (v8i16 vecspltish:$SIMM))]>;
def VSPLTISW : VXForm_3<908, (outs vrrc:$vD), (ins s5imm:$SIMM),
"vspltisw $vD, $SIMM", IIC_VecPerm,
[(set v4i32:$vD, (v4i32 vecspltisw:$SIMM))]>;
// Vector Pack.
def VPKPX : VX1_Int_Ty2<782, "vpkpx", int_ppc_altivec_vpkpx,
v8i16, v4i32>;
def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss,
v16i8, v8i16>;
def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus,
v16i8, v8i16>;
def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
v8i16, v4i32>;
def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
v8i16, v4i32>;
def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vpkuhum $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
(vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>;
def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus,
v16i8, v8i16>;
def VPKUWUM : VXForm_1<78, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vpkuwum $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
(vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>;
def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus,
v8i16, v4i32>;
// Vector Unpack.
def VUPKHPX : VX2_Int_Ty2<846, "vupkhpx", int_ppc_altivec_vupkhpx,
v4i32, v8i16>;
def VUPKHSB : VX2_Int_Ty2<526, "vupkhsb", int_ppc_altivec_vupkhsb,
v8i16, v16i8>;
def VUPKHSH : VX2_Int_Ty2<590, "vupkhsh", int_ppc_altivec_vupkhsh,
v4i32, v8i16>;
def VUPKLPX : VX2_Int_Ty2<974, "vupklpx", int_ppc_altivec_vupklpx,
v4i32, v8i16>;
def VUPKLSB : VX2_Int_Ty2<654, "vupklsb", int_ppc_altivec_vupklsb,
v8i16, v16i8>;
def VUPKLSH : VX2_Int_Ty2<718, "vupklsh", int_ppc_altivec_vupklsh,
v4i32, v8i16>;
// Altivec Comparisons.
class VCMP<bits<10> xo, string asmstr, ValueType Ty>
: VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
IIC_VecFPCompare,
[(set Ty:$vD, (Ty (PPCvcmp Ty:$vA, Ty:$vB, xo)))]>;
class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
: VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
IIC_VecFPCompare,
[(set Ty:$vD, (Ty (PPCvcmp_o Ty:$vA, Ty:$vB, xo)))]> {
let Defs = [CR6];
let RC = 1;
}
// f32 element comparisons.
def VCMPBFP : VCMP <966, "vcmpbfp $vD, $vA, $vB" , v4f32>;
def VCMPBFP_rec : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
def VCMPEQFP : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
def VCMPEQFP_rec : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
def VCMPGEFP : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
def VCMPGEFP_rec : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
def VCMPGTFP : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
def VCMPGTFP_rec : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
// i8 element comparisons.
def VCMPEQUB : VCMP < 6, "vcmpequb $vD, $vA, $vB" , v16i8>;
def VCMPEQUB_rec : VCMPo< 6, "vcmpequb. $vD, $vA, $vB", v16i8>;
def VCMPGTSB : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
def VCMPGTSB_rec : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
def VCMPGTUB : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
def VCMPGTUB_rec : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
// i16 element comparisons.
def VCMPEQUH : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
def VCMPEQUH_rec : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
def VCMPGTSH : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
def VCMPGTSH_rec : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
def VCMPGTUH : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
def VCMPGTUH_rec : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
// i32 element comparisons.
def VCMPEQUW : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
def VCMPEQUW_rec : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
def VCMPGTSW : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
def VCMPGTSW_rec : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
def VCMPGTUW : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
def VCMPGTUW_rec : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
isReMaterializable = 1 in {
def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
"vxor $vD, $vD, $vD", IIC_VecFP,
[(set v16i8:$vD, (v16i8 immAllZerosV))]>;
def V_SET0H : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
"vxor $vD, $vD, $vD", IIC_VecFP,
[(set v8i16:$vD, (v8i16 immAllZerosV))]>;
def V_SET0 : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
"vxor $vD, $vD, $vD", IIC_VecFP,
[(set v4i32:$vD, (v4i32 immAllZerosV))]>;
let IMM=-1 in {
def V_SETALLONESB : VXForm_3<908, (outs vrrc:$vD), (ins),
"vspltisw $vD, -1", IIC_VecFP,
[(set v16i8:$vD, (v16i8 immAllOnesV))]>;
def V_SETALLONESH : VXForm_3<908, (outs vrrc:$vD), (ins),
"vspltisw $vD, -1", IIC_VecFP,
[(set v8i16:$vD, (v8i16 immAllOnesV))]>;
def V_SETALLONES : VXForm_3<908, (outs vrrc:$vD), (ins),
"vspltisw $vD, -1", IIC_VecFP,
[(set v4i32:$vD, (v4i32 immAllOnesV))]>;
}
}
} // VALU Operations.
//===----------------------------------------------------------------------===//
// Additional Altivec Patterns
//
// Extended mnemonics
def : InstAlias<"vmr $vD, $vA", (VOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
def : InstAlias<"vnot $vD, $vA", (VNOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
// Rotates.
def : Pat<(v16i8 (rotl v16i8:$vA, v16i8:$vB)),
(v16i8 (VRLB v16i8:$vA, v16i8:$vB))>;
def : Pat<(v8i16 (rotl v8i16:$vA, v8i16:$vB)),
(v8i16 (VRLH v8i16:$vA, v8i16:$vB))>;
def : Pat<(v4i32 (rotl v4i32:$vA, v4i32:$vB)),
(v4i32 (VRLW v4i32:$vA, v4i32:$vB))>;
// Loads.
def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
// Stores.
def : Pat<(store v4i32:$rS, xoaddr:$dst),
(STVX $rS, xoaddr:$dst)>;
// Bit conversions.
def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 VRRC:$src))), (v16i8 VRRC:$src)>;
def : Pat<(v16i8 (bitconvert (v1i128 VRRC:$src))), (v16i8 VRRC:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 VRRC:$src))), (v8i16 VRRC:$src)>;
def : Pat<(v8i16 (bitconvert (v1i128 VRRC:$src))), (v8i16 VRRC:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>;
def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 VRRC:$src))), (v4i32 VRRC:$src)>;
def : Pat<(v4i32 (bitconvert (v1i128 VRRC:$src))), (v4i32 VRRC:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>;
def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 VRRC:$src))), (v4f32 VRRC:$src)>;
def : Pat<(v4f32 (bitconvert (v1i128 VRRC:$src))), (v4f32 VRRC:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 VRRC:$src))), (v2i64 VRRC:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 VRRC:$src))), (v2i64 VRRC:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 VRRC:$src))), (v2i64 VRRC:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 VRRC:$src))), (v2i64 VRRC:$src)>;
def : Pat<(v2i64 (bitconvert (v1i128 VRRC:$src))), (v2i64 VRRC:$src)>;
def : Pat<(v1i128 (bitconvert (v16i8 VRRC:$src))), (v1i128 VRRC:$src)>;
def : Pat<(v1i128 (bitconvert (v8i16 VRRC:$src))), (v1i128 VRRC:$src)>;
def : Pat<(v1i128 (bitconvert (v4i32 VRRC:$src))), (v1i128 VRRC:$src)>;
def : Pat<(v1i128 (bitconvert (v4f32 VRRC:$src))), (v1i128 VRRC:$src)>;
def : Pat<(v1i128 (bitconvert (v2i64 VRRC:$src))), (v1i128 VRRC:$src)>;
// Max/Min
def : Pat<(v16i8 (umax v16i8:$src1, v16i8:$src2)),
(v16i8 (VMAXUB $src1, $src2))>;
def : Pat<(v16i8 (smax v16i8:$src1, v16i8:$src2)),
(v16i8 (VMAXSB $src1, $src2))>;
def : Pat<(v8i16 (umax v8i16:$src1, v8i16:$src2)),
(v8i16 (VMAXUH $src1, $src2))>;
def : Pat<(v8i16 (smax v8i16:$src1, v8i16:$src2)),
(v8i16 (VMAXSH $src1, $src2))>;
def : Pat<(v4i32 (umax v4i32:$src1, v4i32:$src2)),
(v4i32 (VMAXUW $src1, $src2))>;
def : Pat<(v4i32 (smax v4i32:$src1, v4i32:$src2)),
(v4i32 (VMAXSW $src1, $src2))>;
def : Pat<(v16i8 (umin v16i8:$src1, v16i8:$src2)),
(v16i8 (VMINUB $src1, $src2))>;
def : Pat<(v16i8 (smin v16i8:$src1, v16i8:$src2)),
(v16i8 (VMINSB $src1, $src2))>;
def : Pat<(v8i16 (umin v8i16:$src1, v8i16:$src2)),
(v8i16 (VMINUH $src1, $src2))>;
def : Pat<(v8i16 (smin v8i16:$src1, v8i16:$src2)),
(v8i16 (VMINSH $src1, $src2))>;
def : Pat<(v4i32 (umin v4i32:$src1, v4i32:$src2)),
(v4i32 (VMINUW $src1, $src2))>;
def : Pat<(v4i32 (smin v4i32:$src1, v4i32:$src2)),
(v4i32 (VMINSW $src1, $src2))>;
// Shuffles.
// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x)
def:Pat<(vsldoi_unary_shuffle:$in v16i8:$vA, undef),
(VSLDOI $vA, $vA, (VSLDOI_unary_get_imm $in))>;
def:Pat<(vpkuwum_unary_shuffle v16i8:$vA, undef),
(VPKUWUM $vA, $vA)>;
def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef),
(VPKUHUM $vA, $vA)>;
def:Pat<(vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB),
(VSLDOI v16i8:$vA, v16i8:$vB, (VSLDOI_get_imm $SH))>;
// Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands.
// These fragments are matched for little-endian, where the inputs must
// be swapped for correct semantics.
def:Pat<(vsldoi_swapped_shuffle:$in v16i8:$vA, v16i8:$vB),
(VSLDOI $vB, $vA, (VSLDOI_swapped_get_imm $in))>;
def:Pat<(vpkuwum_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VPKUWUM $vB, $vA)>;
def:Pat<(vpkuhum_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VPKUHUM $vB, $vA)>;
// Match vmrg*(x,x)
def:Pat<(vmrglb_unary_shuffle v16i8:$vA, undef),
(VMRGLB $vA, $vA)>;
def:Pat<(vmrglh_unary_shuffle v16i8:$vA, undef),
(VMRGLH $vA, $vA)>;
def:Pat<(vmrglw_unary_shuffle v16i8:$vA, undef),
(VMRGLW $vA, $vA)>;
def:Pat<(vmrghb_unary_shuffle v16i8:$vA, undef),
(VMRGHB $vA, $vA)>;
def:Pat<(vmrghh_unary_shuffle v16i8:$vA, undef),
(VMRGHH $vA, $vA)>;
def:Pat<(vmrghw_unary_shuffle v16i8:$vA, undef),
(VMRGHW $vA, $vA)>;
// Match vmrg*(y,x), i.e., swapped operands. These fragments
// are matched for little-endian, where the inputs must be
// swapped for correct semantics.
def:Pat<(vmrglb_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VMRGLB $vB, $vA)>;
def:Pat<(vmrglh_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VMRGLH $vB, $vA)>;
def:Pat<(vmrglw_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VMRGLW $vB, $vA)>;
def:Pat<(vmrghb_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VMRGHB $vB, $vA)>;
def:Pat<(vmrghh_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VMRGHH $vB, $vA)>;
def:Pat<(vmrghw_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VMRGHW $vB, $vA)>;
// Logical Operations
def : Pat<(vnot_ppc v4i32:$vA), (VNOR $vA, $vA)>;
def : Pat<(vnot_ppc (or v4i32:$A, v4i32:$B)),
(VNOR $A, $B)>;
def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)),
(VANDC $A, $B)>;
def : Pat<(fmul v4f32:$vA, v4f32:$vB),
(VMADDFP $vA, $vB,
(v4i32 (VSLW (v4i32 (V_SETALLONES)), (v4i32 (V_SETALLONES)))))>;
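// Note on the pattern above (editorial sketch, not part of the original
// source): (VSLW all-ones, all-ones) shifts each 32-bit lane of 0xFFFFFFFF
// left by 31 (the low 5 bits of the count), leaving 0x80000000 per lane,
// which is the bit pattern of -0.0f. VMADDFP then computes vA*vB + (-0.0),
// and x + (-0.0) == x under round-to-nearest, so this is a plain multiply:
//   uint32_t ones = 0xFFFFFFFFu;
//   uint32_t lane = ones << (ones & 31u);  // 0x80000000 == bits of -0.0f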
// Fused multiply-add and multiply-subtract for packed float. These are
// represented separately from the real instructions above, for operations
// that must have the additional precision, such as Newton-Raphson (used by
// divide, sqrt).
def : Pat<(PPCvmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
(VMADDFP $A, $B, $C)>;
def : Pat<(PPCvnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
(VNMSUBFP $A, $B, $C)>;
def : Pat<(int_ppc_altivec_vmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
(VMADDFP $A, $B, $C)>;
def : Pat<(int_ppc_altivec_vnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
(VNMSUBFP $A, $B, $C)>;
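// A minimal scalar sketch of why that extra precision matters (illustrative
// only; refine_recip is a hypothetical helper, not an LLVM function): one
// Newton-Raphson refinement step for 1/b built on fused multiply-add, the
// kind of sequence vmaddfp/vnmsubfp feed:
//   float refine_recip(float b, float est) {
//     float err = fmaf(-b, est, 1.0f);  // 1 - b*est in a single rounding
//     return fmaf(est, err, est);       // est + est*(1 - b*est)
//   }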
def : Pat<(PPCvperm v16i8:$vA, v16i8:$vB, v16i8:$vC),
(VPERM $vA, $vB, $vC)>;
def : Pat<(PPCfre v4f32:$A), (VREFP $A)>;
def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>;
// Vector shifts
def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSLB $vA, $vB))>;
def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSLH $vA, $vB))>;
def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
def : Pat<(v1i128 (shl v1i128:$vA, v1i128:$vB)),
(v1i128 (VSL (v16i8 (VSLO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSLB $vA, $vB))>;
def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSLH $vA, $vB))>;
def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
def : Pat<(v1i128 (PPCshl v1i128:$vA, v1i128:$vB)),
(v1i128 (VSL (v16i8 (VSLO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRH $vA, $vB))>;
def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
def : Pat<(v1i128 (srl v1i128:$vA, v1i128:$vB)),
(v1i128 (VSR (v16i8 (VSRO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRH $vA, $vB))>;
def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
def : Pat<(v1i128 (PPCsrl v1i128:$vA, v1i128:$vB)),
(v1i128 (VSR (v16i8 (VSRO $vA, $vB)), (v16i8 (VSPLTB 15, $vB))))>;
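// Note on the v1i128 shifts above (editorial): the quadword shift is done in
// two steps -- VSLO/VSRO shifts by whole bytes and VSL/VSR by the remaining
// 0-7 bits. VSL/VSR require the bit count to be replicated into every byte
// of the shift operand, which is what (VSPLTB 15, $vB) provides by splatting
// vB's least-significant byte.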
def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRAB $vA, $vB))>;
def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRAH $vA, $vB))>;
def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRAW $vA, $vB))>;
def : Pat<(v16i8 (PPCsra v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRAB $vA, $vB))>;
def : Pat<(v8i16 (PPCsra v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRAH $vA, $vB))>;
def : Pat<(v4i32 (PPCsra v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRAW $vA, $vB))>;
// Float to integer and integer to float conversions
def : Pat<(v4i32 (fp_to_sint v4f32:$vA)),
(VCTSXS_0 $vA)>;
def : Pat<(v4i32 (fp_to_uint v4f32:$vA)),
(VCTUXS_0 $vA)>;
def : Pat<(v4f32 (sint_to_fp v4i32:$vA)),
(VCFSX_0 $vA)>;
def : Pat<(v4f32 (uint_to_fp v4i32:$vA)),
(VCFUX_0 $vA)>;
// Floating-point rounding
def : Pat<(v4f32 (ffloor v4f32:$vA)),
(VRFIM $vA)>;
def : Pat<(v4f32 (fceil v4f32:$vA)),
(VRFIP $vA)>;
def : Pat<(v4f32 (ftrunc v4f32:$vA)),
(VRFIZ $vA)>;
def : Pat<(v4f32 (fnearbyint v4f32:$vA)),
(VRFIN $vA)>;
// Vector selection
def : Pat<(v16i8 (vselect v16i8:$vA, v16i8:$vB, v16i8:$vC)),
(VSEL $vC, $vB, $vA)>;
def : Pat<(v8i16 (vselect v8i16:$vA, v8i16:$vB, v8i16:$vC)),
(VSEL $vC, $vB, $vA)>;
def : Pat<(v4i32 (vselect v4i32:$vA, v4i32:$vB, v4i32:$vC)),
(VSEL $vC, $vB, $vA)>;
def : Pat<(v2i64 (vselect v2i64:$vA, v2i64:$vB, v2i64:$vC)),
(VSEL $vC, $vB, $vA)>;
def : Pat<(v4f32 (vselect v4i32:$vA, v4f32:$vB, v4f32:$vC)),
(VSEL $vC, $vB, $vA)>;
def : Pat<(v2f64 (vselect v2i64:$vA, v2f64:$vB, v2f64:$vC)),
(VSEL $vC, $vB, $vA)>;
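// Operand-order note for the vselect patterns above (editorial): vsel
// computes (A & ~C) | (B & C), taking B wherever the mask C has 1-bits, so
// ISD::vselect(mask, t, f) lowers to VSEL f, t, mask -- hence the reversed
// operands.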
// Vector Integer Average Instructions
def : Pat<(v4i32 (sra (sub v4i32:$vA, (vnot_ppc v4i32:$vB)),
(v4i32 (immEQOneV)))), (v4i32 (VAVGSW $vA, $vB))>;
def : Pat<(v8i16 (sra (sub v8i16:$vA, (v8i16 (bitconvert(vnot_ppc v4i32:$vB)))),
(v8i16 (immEQOneV)))), (v8i16 (VAVGSH $vA, $vB))>;
def : Pat<(v16i8 (sra (sub v16i8:$vA, (v16i8 (bitconvert(vnot_ppc v4i32:$vB)))),
(v16i8 (immEQOneV)))), (v16i8 (VAVGSB $vA, $vB))>;
def : Pat<(v4i32 (srl (sub v4i32:$vA, (vnot_ppc v4i32:$vB)),
(v4i32 (immEQOneV)))), (v4i32 (VAVGUW $vA, $vB))>;
def : Pat<(v8i16 (srl (sub v8i16:$vA, (v8i16 (bitconvert(vnot_ppc v4i32:$vB)))),
(v8i16 (immEQOneV)))), (v8i16 (VAVGUH $vA, $vB))>;
def : Pat<(v16i8 (srl (sub v16i8:$vA, (v16i8 (bitconvert(vnot_ppc v4i32:$vB)))),
(v16i8 (immEQOneV)))), (v16i8 (VAVGUB $vA, $vB))>;
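// Why the average patterns above are correct (editorial sketch; avg_signed
// is a hypothetical helper): in two's complement vnot(b) == -b - 1, so
// a - vnot(b) == a + b + 1, and the shift by one then yields the rounded
// average that vavgs*/vavgu* define (the hardware keeps the intermediate
// sum wide enough not to overflow):
//   int32_t avg_signed(int32_t a, int32_t b) {
//     return (int32_t)(((int64_t)a + b + 1) >> 1);  // round-half-up average
//   }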
} // end HasAltivec
def HasP8Altivec : Predicate<"PPCSubTarget->hasP8Altivec()">;
def HasP8Crypto : Predicate<"PPCSubTarget->hasP8Crypto()">;
let Predicates = [HasP8Altivec] in {
let isCommutable = 1 in {
def VMULESW : VX1_Int_Ty2<904, "vmulesw", int_ppc_altivec_vmulesw,
v2i64, v4i32>;
def VMULEUW : VX1_Int_Ty2<648, "vmuleuw", int_ppc_altivec_vmuleuw,
v2i64, v4i32>;
def VMULOSW : VX1_Int_Ty2<392, "vmulosw", int_ppc_altivec_vmulosw,
v2i64, v4i32>;
def VMULOUW : VX1_Int_Ty2<136, "vmulouw", int_ppc_altivec_vmulouw,
v2i64, v4i32>;
def VMULUWM : VXForm_1<137, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmuluwm $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (mul v4i32:$vA, v4i32:$vB))]>;
def VMAXSD : VX1_Int_Ty<450, "vmaxsd", int_ppc_altivec_vmaxsd, v2i64>;
def VMAXUD : VX1_Int_Ty<194, "vmaxud", int_ppc_altivec_vmaxud, v2i64>;
def VMINSD : VX1_Int_Ty<962, "vminsd", int_ppc_altivec_vminsd, v2i64>;
def VMINUD : VX1_Int_Ty<706, "vminud", int_ppc_altivec_vminud, v2i64>;
} // isCommutable
// Vector merge
def VMRGEW : VXForm_1<1932, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrgew $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
(v16i8 (vmrgew_shuffle v16i8:$vA, v16i8:$vB)))]>;
def VMRGOW : VXForm_1<1676, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrgow $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
(v16i8 (vmrgow_shuffle v16i8:$vA, v16i8:$vB)))]>;
// Match vmrgew(x,x) and vmrgow(x,x)
def:Pat<(vmrgew_unary_shuffle v16i8:$vA, undef),
(VMRGEW $vA, $vA)>;
def:Pat<(vmrgow_unary_shuffle v16i8:$vA, undef),
(VMRGOW $vA, $vA)>;
// Match vmrgew(y,x) and vmrgow(y,x), i.e., swapped operands. These fragments
// are matched for little-endian, where the inputs must be swapped for correct
// semantics.
def:Pat<(vmrgew_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VMRGEW $vB, $vA)>;
def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VMRGOW $vB, $vA)>;
// Vector rotates.
def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>;
def : Pat<(v2i64 (rotl v2i64:$vA, v2i64:$vB)),
(v2i64 (VRLD v2i64:$vA, v2i64:$vB))>;
// Vector shifts
def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vsld $vD, $vA, $vB", IIC_VecGeneral, []>;
def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vsrd $vD, $vA, $vB", IIC_VecGeneral, []>;
def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vsrad $vD, $vA, $vB", IIC_VecGeneral, []>;
def : Pat<(v2i64 (shl v2i64:$vA, v2i64:$vB)),
(v2i64 (VSLD $vA, $vB))>;
def : Pat<(v2i64 (PPCshl v2i64:$vA, v2i64:$vB)),
(v2i64 (VSLD $vA, $vB))>;
def : Pat<(v2i64 (srl v2i64:$vA, v2i64:$vB)),
(v2i64 (VSRD $vA, $vB))>;
def : Pat<(v2i64 (PPCsrl v2i64:$vA, v2i64:$vB)),
(v2i64 (VSRD $vA, $vB))>;
def : Pat<(v2i64 (sra v2i64:$vA, v2i64:$vB)),
(v2i64 (VSRAD $vA, $vB))>;
def : Pat<(v2i64 (PPCsra v2i64:$vA, v2i64:$vB)),
(v2i64 (VSRAD $vA, $vB))>;
// Vector Integer Arithmetic Instructions
let isCommutable = 1 in {
def VADDUDM : VXForm_1<192, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vaddudm $vD, $vA, $vB", IIC_VecGeneral,
[(set v2i64:$vD, (add v2i64:$vA, v2i64:$vB))]>;
def VADDUQM : VXForm_1<256, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vadduqm $vD, $vA, $vB", IIC_VecGeneral,
[(set v1i128:$vD, (add v1i128:$vA, v1i128:$vB))]>;
} // isCommutable
// Vector Quadword Add
def VADDEUQM : VA1a_Int_Ty<60, "vaddeuqm", int_ppc_altivec_vaddeuqm, v1i128>;
def VADDCUQ : VX1_Int_Ty<320, "vaddcuq", int_ppc_altivec_vaddcuq, v1i128>;
def VADDECUQ : VA1a_Int_Ty<61, "vaddecuq", int_ppc_altivec_vaddecuq, v1i128>;
// Vector Doubleword Subtract
def VSUBUDM : VXForm_1<1216, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vsubudm $vD, $vA, $vB", IIC_VecGeneral,
[(set v2i64:$vD, (sub v2i64:$vA, v2i64:$vB))]>;
// Vector Quadword Subtract
def VSUBUQM : VXForm_1<1280, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vsubuqm $vD, $vA, $vB", IIC_VecGeneral,
[(set v1i128:$vD, (sub v1i128:$vA, v1i128:$vB))]>;
def VSUBEUQM : VA1a_Int_Ty<62, "vsubeuqm", int_ppc_altivec_vsubeuqm, v1i128>;
def VSUBCUQ : VX1_Int_Ty<1344, "vsubcuq", int_ppc_altivec_vsubcuq, v1i128>;
def VSUBECUQ : VA1a_Int_Ty<63, "vsubecuq", int_ppc_altivec_vsubecuq, v1i128>;
// Count Leading Zeros
def VCLZB : VXForm_2<1794, (outs vrrc:$vD), (ins vrrc:$vB),
"vclzb $vD, $vB", IIC_VecGeneral,
[(set v16i8:$vD, (ctlz v16i8:$vB))]>;
def VCLZH : VXForm_2<1858, (outs vrrc:$vD), (ins vrrc:$vB),
"vclzh $vD, $vB", IIC_VecGeneral,
[(set v8i16:$vD, (ctlz v8i16:$vB))]>;
def VCLZW : VXForm_2<1922, (outs vrrc:$vD), (ins vrrc:$vB),
"vclzw $vD, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (ctlz v4i32:$vB))]>;
def VCLZD : VXForm_2<1986, (outs vrrc:$vD), (ins vrrc:$vB),
"vclzd $vD, $vB", IIC_VecGeneral,
[(set v2i64:$vD, (ctlz v2i64:$vB))]>;
// Population Count
def VPOPCNTB : VXForm_2<1795, (outs vrrc:$vD), (ins vrrc:$vB),
"vpopcntb $vD, $vB", IIC_VecGeneral,
[(set v16i8:$vD, (ctpop v16i8:$vB))]>;
def VPOPCNTH : VXForm_2<1859, (outs vrrc:$vD), (ins vrrc:$vB),
"vpopcnth $vD, $vB", IIC_VecGeneral,
[(set v8i16:$vD, (ctpop v8i16:$vB))]>;
def VPOPCNTW : VXForm_2<1923, (outs vrrc:$vD), (ins vrrc:$vB),
"vpopcntw $vD, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (ctpop v4i32:$vB))]>;
def VPOPCNTD : VXForm_2<1987, (outs vrrc:$vD), (ins vrrc:$vB),
"vpopcntd $vD, $vB", IIC_VecGeneral,
[(set v2i64:$vD, (ctpop v2i64:$vB))]>;
let isCommutable = 1 in {
// FIXME: Use AddedComplexity > 400 to ensure these patterns match before the
// VSX equivalents. We need to fix this up at some point. Two possible
// solutions for this problem:
// 1. Disable Altivec patterns that compete with VSX patterns using the
// !HasVSX predicate. This essentially favours VSX over Altivec, in
// hopes of reducing register pressure (larger register set using VSX
// instructions than VMX instructions)
// 2. Employ a more disciplined use of AddedComplexity, which would provide
// more fine-grained control than option 1. This would be beneficial
// if we find situations where Altivec is really preferred over VSX.
def VEQV : VXForm_1<1668, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"veqv $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (vnot_ppc (xor v4i32:$vA, v4i32:$vB)))]>;
def VNAND : VXForm_1<1412, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vnand $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (vnot_ppc (and v4i32:$vA, v4i32:$vB)))]>;
} // isCommutable
def VORC : VXForm_1<1348, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vorc $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (or v4i32:$vA,
(vnot_ppc v4i32:$vB)))]>;
// i64 element comparisons.
def VCMPEQUD : VCMP <199, "vcmpequd $vD, $vA, $vB" , v2i64>;
def VCMPEQUD_rec : VCMPo<199, "vcmpequd. $vD, $vA, $vB", v2i64>;
def VCMPGTSD : VCMP <967, "vcmpgtsd $vD, $vA, $vB" , v2i64>;
def VCMPGTSD_rec : VCMPo<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>;
def VCMPGTUD : VCMP <711, "vcmpgtud $vD, $vA, $vB" , v2i64>;
def VCMPGTUD_rec : VCMPo<711, "vcmpgtud. $vD, $vA, $vB", v2i64>;
// The cryptography instructions that do not require Category:Vector.Crypto
def VPMSUMB : VX1_Int_Ty<1032, "vpmsumb",
int_ppc_altivec_crypto_vpmsumb, v16i8>;
def VPMSUMH : VX1_Int_Ty<1096, "vpmsumh",
int_ppc_altivec_crypto_vpmsumh, v8i16>;
def VPMSUMW : VX1_Int_Ty<1160, "vpmsumw",
int_ppc_altivec_crypto_vpmsumw, v4i32>;
def VPMSUMD : VX1_Int_Ty<1224, "vpmsumd",
int_ppc_altivec_crypto_vpmsumd, v2i64>;
def VPERMXOR : VA1a_Int_Ty<45, "vpermxor",
int_ppc_altivec_crypto_vpermxor, v16i8>;
// Vector doubleword integer pack and unpack.
def VPKSDSS : VX1_Int_Ty2<1486, "vpksdss", int_ppc_altivec_vpksdss,
v4i32, v2i64>;
def VPKSDUS : VX1_Int_Ty2<1358, "vpksdus", int_ppc_altivec_vpksdus,
v4i32, v2i64>;
def VPKUDUM : VXForm_1<1102, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vpkudum $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
(vpkudum_shuffle v16i8:$vA, v16i8:$vB))]>;
def VPKUDUS : VX1_Int_Ty2<1230, "vpkudus", int_ppc_altivec_vpkudus,
v4i32, v2i64>;
def VUPKHSW : VX2_Int_Ty2<1614, "vupkhsw", int_ppc_altivec_vupkhsw,
v2i64, v4i32>;
def VUPKLSW : VX2_Int_Ty2<1742, "vupklsw", int_ppc_altivec_vupklsw,
v2i64, v4i32>;
// Shuffle patterns for unary and swapped (LE) vector pack modulo.
def:Pat<(vpkudum_unary_shuffle v16i8:$vA, undef),
(VPKUDUM $vA, $vA)>;
def:Pat<(vpkudum_swapped_shuffle v16i8:$vA, v16i8:$vB),
(VPKUDUM $vB, $vA)>;
def VGBBD : VX2_Int_Ty2<1292, "vgbbd", int_ppc_altivec_vgbbd, v16i8, v16i8>;
def VBPERMQ : VX1_Int_Ty2<1356, "vbpermq", int_ppc_altivec_vbpermq,
v2i64, v16i8>;
} // end HasP8Altivec
// Crypto instructions (from builtins)
let Predicates = [HasP8Crypto] in {
def VSHASIGMAW : VXCR_Int_Ty<1666, "vshasigmaw",
int_ppc_altivec_crypto_vshasigmaw, v4i32>;
def VSHASIGMAD : VXCR_Int_Ty<1730, "vshasigmad",
int_ppc_altivec_crypto_vshasigmad, v2i64>;
def VCIPHER : VX1_Int_Ty<1288, "vcipher", int_ppc_altivec_crypto_vcipher,
v2i64>;
def VCIPHERLAST : VX1_Int_Ty<1289, "vcipherlast",
int_ppc_altivec_crypto_vcipherlast, v2i64>;
def VNCIPHER : VX1_Int_Ty<1352, "vncipher",
int_ppc_altivec_crypto_vncipher, v2i64>;
def VNCIPHERLAST : VX1_Int_Ty<1353, "vncipherlast",
int_ppc_altivec_crypto_vncipherlast, v2i64>;
def VSBOX : VXBX_Int_Ty<1480, "vsbox", int_ppc_altivec_crypto_vsbox, v2i64>;
} // HasP8Crypto
// The following Altivec instructions were introduced in Power ISA 3.0.
def HasP9Altivec : Predicate<"PPCSubTarget->hasP9Altivec()">;
let Predicates = [HasP9Altivec] in {
+// Vector Multiply-Sum
+def VMSUMUDM : VA1a_Int_Ty3<35, "vmsumudm", int_ppc_altivec_vmsumudm,
+ v1i128, v2i64, v1i128>;
+
// i8 element comparisons.
def VCMPNEB : VCMP < 7, "vcmpneb $vD, $vA, $vB" , v16i8>;
def VCMPNEB_rec : VCMPo < 7, "vcmpneb. $vD, $vA, $vB" , v16i8>;
def VCMPNEZB : VCMP <263, "vcmpnezb $vD, $vA, $vB" , v16i8>;
def VCMPNEZB_rec : VCMPo<263, "vcmpnezb. $vD, $vA, $vB", v16i8>;
// i16 element comparisons.
def VCMPNEH : VCMP < 71, "vcmpneh $vD, $vA, $vB" , v8i16>;
def VCMPNEH_rec : VCMPo< 71, "vcmpneh. $vD, $vA, $vB" , v8i16>;
def VCMPNEZH : VCMP <327, "vcmpnezh $vD, $vA, $vB" , v8i16>;
def VCMPNEZH_rec : VCMPo<327, "vcmpnezh. $vD, $vA, $vB", v8i16>;
// i32 element comparisons.
def VCMPNEW : VCMP <135, "vcmpnew $vD, $vA, $vB" , v4i32>;
def VCMPNEW_rec : VCMPo<135, "vcmpnew. $vD, $vA, $vB" , v4i32>;
def VCMPNEZW : VCMP <391, "vcmpnezw $vD, $vA, $vB" , v4i32>;
def VCMPNEZW_rec : VCMPo<391, "vcmpnezw. $vD, $vA, $vB", v4i32>;
// VX-Form: [PO VRT / UIM VRB XO].
// We implement it with VXForm_1, using the 5-bit "VRA" field to represent
// the 1-bit "/" followed by the 4-bit "UIM".
class VX1_VT5_UIM5_VB5<bits<11> xo, string opc, list<dag> pattern>
: VXForm_1<xo, (outs vrrc:$vD), (ins u4imm:$UIMM, vrrc:$vB),
!strconcat(opc, " $vD, $vB, $UIMM"), IIC_VecGeneral, pattern>;
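// Encoding note (editorial): the 5-bit VRA slot is reused as
// [ '/' (1 bit, zero) | UIM (4 bits) ], so a u4imm operand dropped into that
// field produces the correct "/ UIM" encoding without a dedicated format.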
class VX1_RT5_RA5_VB5<bits<11> xo, string opc, list<dag> pattern>
: VXForm_1<xo, (outs g8rc:$rD), (ins g8rc:$rA, vrrc:$vB),
!strconcat(opc, " $rD, $rA, $vB"), IIC_VecGeneral, pattern>;
// Vector Extract Unsigned
def VEXTRACTUB : VX1_VT5_UIM5_VB5<525, "vextractub", []>;
def VEXTRACTUH : VX1_VT5_UIM5_VB5<589, "vextractuh", []>;
def VEXTRACTUW : VX1_VT5_UIM5_VB5<653, "vextractuw", []>;
def VEXTRACTD : VX1_VT5_UIM5_VB5<717, "vextractd" , []>;
// Vector Extract Unsigned Byte/Halfword/Word Left/Right-Indexed
let hasSideEffects = 0 in {
def VEXTUBLX : VX1_RT5_RA5_VB5<1549, "vextublx", []>;
def VEXTUBRX : VX1_RT5_RA5_VB5<1805, "vextubrx", []>;
def VEXTUHLX : VX1_RT5_RA5_VB5<1613, "vextuhlx", []>;
def VEXTUHRX : VX1_RT5_RA5_VB5<1869, "vextuhrx", []>;
def VEXTUWLX : VX1_RT5_RA5_VB5<1677, "vextuwlx", []>;
def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>;
}
// Vector Insert Element Instructions
def VINSERTB : VXForm_1<781, (outs vrrc:$vD),
(ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
"vinsertb $vD, $vB, $UIM", IIC_VecGeneral,
[(set v16i8:$vD, (PPCvecinsert v16i8:$vDi, v16i8:$vB,
imm32SExt16:$UIM))]>,
RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
def VINSERTH : VXForm_1<845, (outs vrrc:$vD),
(ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
"vinserth $vD, $vB, $UIM", IIC_VecGeneral,
[(set v8i16:$vD, (PPCvecinsert v8i16:$vDi, v8i16:$vB,
imm32SExt16:$UIM))]>,
RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>;
def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>;
class VX_VT5_EO5_VB5<bits<11> xo, bits<5> eo, string opc, list<dag> pattern>
: VXForm_RD5_XO5_RS5<xo, eo, (outs vrrc:$vD), (ins vrrc:$vB),
!strconcat(opc, " $vD, $vB"), IIC_VecGeneral, pattern>;
class VX_VT5_EO5_VB5s<bits<11> xo, bits<5> eo, string opc, list<dag> pattern>
: VXForm_RD5_XO5_RS5<xo, eo, (outs vfrc:$vD), (ins vfrc:$vB),
!strconcat(opc, " $vD, $vB"), IIC_VecGeneral, pattern>;
// Vector Count Leading/Trailing Zero LSB. Result is placed into GPR[rD]
def VCLZLSBB : VXForm_RD5_XO5_RS5<1538, 0, (outs gprc:$rD), (ins vrrc:$vB),
"vclzlsbb $rD, $vB", IIC_VecGeneral,
[(set i32:$rD, (int_ppc_altivec_vclzlsbb
v16i8:$vB))]>;
def VCTZLSBB : VXForm_RD5_XO5_RS5<1538, 1, (outs gprc:$rD), (ins vrrc:$vB),
"vctzlsbb $rD, $vB", IIC_VecGeneral,
[(set i32:$rD, (int_ppc_altivec_vctzlsbb
v16i8:$vB))]>;
// Vector Count Trailing Zeros
def VCTZB : VX_VT5_EO5_VB5<1538, 28, "vctzb",
[(set v16i8:$vD, (cttz v16i8:$vB))]>;
def VCTZH : VX_VT5_EO5_VB5<1538, 29, "vctzh",
[(set v8i16:$vD, (cttz v8i16:$vB))]>;
def VCTZW : VX_VT5_EO5_VB5<1538, 30, "vctzw",
[(set v4i32:$vD, (cttz v4i32:$vB))]>;
def VCTZD : VX_VT5_EO5_VB5<1538, 31, "vctzd",
[(set v2i64:$vD, (cttz v2i64:$vB))]>;
// Vector Extend Sign
def VEXTSB2W : VX_VT5_EO5_VB5<1538, 16, "vextsb2w", []>;
def VEXTSH2W : VX_VT5_EO5_VB5<1538, 17, "vextsh2w", []>;
def VEXTSB2D : VX_VT5_EO5_VB5<1538, 24, "vextsb2d", []>;
def VEXTSH2D : VX_VT5_EO5_VB5<1538, 25, "vextsh2d", []>;
def VEXTSW2D : VX_VT5_EO5_VB5<1538, 26, "vextsw2d", []>;
let isCodeGenOnly = 1 in {
def VEXTSB2Ws : VX_VT5_EO5_VB5s<1538, 16, "vextsb2w", []>;
def VEXTSH2Ws : VX_VT5_EO5_VB5s<1538, 17, "vextsh2w", []>;
def VEXTSB2Ds : VX_VT5_EO5_VB5s<1538, 24, "vextsb2d", []>;
def VEXTSH2Ds : VX_VT5_EO5_VB5s<1538, 25, "vextsh2d", []>;
def VEXTSW2Ds : VX_VT5_EO5_VB5s<1538, 26, "vextsw2d", []>;
}
def : Pat<(v4i32 (sext_inreg v4i32:$VRB, v4i8)), (v4i32 (VEXTSB2W $VRB))>;
def : Pat<(v4i32 (sext_inreg v4i32:$VRB, v4i16)), (v4i32 (VEXTSH2W $VRB))>;
def : Pat<(v2i64 (sext_inreg v2i64:$VRB, v2i8)), (v2i64 (VEXTSB2D $VRB))>;
def : Pat<(v2i64 (sext_inreg v2i64:$VRB, v2i16)), (v2i64 (VEXTSH2D $VRB))>;
def : Pat<(v2i64 (sext_inreg v2i64:$VRB, v2i32)), (v2i64 (VEXTSW2D $VRB))>;
// Vector Integer Negate
def VNEGW : VX_VT5_EO5_VB5<1538, 6, "vnegw",
[(set v4i32:$vD,
(sub (v4i32 immAllZerosV), v4i32:$vB))]>;
def VNEGD : VX_VT5_EO5_VB5<1538, 7, "vnegd",
[(set v2i64:$vD,
(sub (v2i64 (bitconvert (v4i32 immAllZerosV))),
v2i64:$vB))]>;
// Vector Parity Byte
def VPRTYBW : VX_VT5_EO5_VB5<1538, 8, "vprtybw", [(set v4i32:$vD,
(int_ppc_altivec_vprtybw v4i32:$vB))]>;
def VPRTYBD : VX_VT5_EO5_VB5<1538, 9, "vprtybd", [(set v2i64:$vD,
(int_ppc_altivec_vprtybd v2i64:$vB))]>;
def VPRTYBQ : VX_VT5_EO5_VB5<1538, 10, "vprtybq", [(set v1i128:$vD,
(int_ppc_altivec_vprtybq v1i128:$vB))]>;
// Vector (Bit) Permute (Right-indexed)
def VBPERMD : VXForm_1<1484, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vbpermd $vD, $vA, $vB", IIC_VecFP, []>;
def VPERMR : VAForm_1a<59, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
"vpermr $vD, $vA, $vB, $vC", IIC_VecFP, []>;
class VX1_VT5_VA5_VB5<bits<11> xo, string opc, list<dag> pattern>
: VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
!strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern>;
// Vector Rotate Left Mask/Mask-Insert
def VRLWNM : VX1_VT5_VA5_VB5<389, "vrlwnm",
[(set v4i32:$vD,
(int_ppc_altivec_vrlwnm v4i32:$vA,
v4i32:$vB))]>;
def VRLWMI : VXForm_1<133, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
"vrlwmi $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD,
(int_ppc_altivec_vrlwmi v4i32:$vA, v4i32:$vB,
v4i32:$vDi))]>,
RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
def VRLDNM : VX1_VT5_VA5_VB5<453, "vrldnm",
[(set v2i64:$vD,
(int_ppc_altivec_vrldnm v2i64:$vA,
v2i64:$vB))]>;
def VRLDMI : VXForm_1<197, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
"vrldmi $vD, $vA, $vB", IIC_VecFP,
[(set v2i64:$vD,
(int_ppc_altivec_vrldmi v2i64:$vA, v2i64:$vB,
v2i64:$vDi))]>,
RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
// Vector Shift Left/Right
def VSLV : VX1_VT5_VA5_VB5<1860, "vslv",
[(set v16i8:$vD, (int_ppc_altivec_vslv v16i8:$vA, v16i8:$vB))]>;
def VSRV : VX1_VT5_VA5_VB5<1796, "vsrv",
[(set v16i8:$vD, (int_ppc_altivec_vsrv v16i8:$vA, v16i8:$vB))]>;
// Vector Multiply-by-10 (& Write Carry) Unsigned Quadword
def VMUL10UQ : VXForm_BX<513, (outs vrrc:$vD), (ins vrrc:$vA),
"vmul10uq $vD, $vA", IIC_VecFP, []>;
def VMUL10CUQ : VXForm_BX< 1, (outs vrrc:$vD), (ins vrrc:$vA),
"vmul10cuq $vD, $vA", IIC_VecFP, []>;
// Vector Multiply-by-10 Extended (& Write Carry) Unsigned Quadword
def VMUL10EUQ : VX1_VT5_VA5_VB5<577, "vmul10euq" , []>;
def VMUL10ECUQ : VX1_VT5_VA5_VB5< 65, "vmul10ecuq", []>;
// Decimal Integer Format Conversion Instructions
// [PO VRT EO VRB 1 PS XO], "_o" means CR6 is set.
class VX_VT5_EO5_VB5_PS1_XO9_o<bits<5> eo, bits<9> xo, string opc,
list<dag> pattern>
: VX_RD5_EO5_RS5_PS1_XO9<eo, xo, (outs vrrc:$vD), (ins vrrc:$vB, u1imm:$PS),
!strconcat(opc, " $vD, $vB, $PS"), IIC_VecFP, pattern> {
let Defs = [CR6];
}
// [PO VRT EO VRB 1 / XO]
class VX_VT5_EO5_VB5_XO9_o<bits<5> eo, bits<9> xo, string opc,
list<dag> pattern>
: VX_RD5_EO5_RS5_PS1_XO9<eo, xo, (outs vrrc:$vD), (ins vrrc:$vB),
!strconcat(opc, " $vD, $vB"), IIC_VecFP, pattern> {
let Defs = [CR6];
let PS = 0;
}
// Decimal Convert From/to National/Zoned/Signed-QWord
def BCDCFN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<7, 385, "bcdcfn." , []>;
def BCDCFZ_rec : VX_VT5_EO5_VB5_PS1_XO9_o<6, 385, "bcdcfz." , []>;
def BCDCTN_rec : VX_VT5_EO5_VB5_XO9_o <5, 385, "bcdctn." , []>;
def BCDCTZ_rec : VX_VT5_EO5_VB5_PS1_XO9_o<4, 385, "bcdctz." , []>;
def BCDCFSQ_rec : VX_VT5_EO5_VB5_PS1_XO9_o<2, 385, "bcdcfsq.", []>;
def BCDCTSQ_rec : VX_VT5_EO5_VB5_XO9_o <0, 385, "bcdctsq.", []>;
// Decimal Copy-Sign/Set-Sign
let Defs = [CR6] in
def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>;
def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>;
// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set.
class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>
: VX_RD5_RSp5_PS1_XO9<xo,
(outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS),
!strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> {
let Defs = [CR6];
}
// [PO VRT VRA VRB 1 / XO]
class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern>
: VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
!strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> {
let Defs = [CR6];
let PS = 0;
}
// Decimal Shift/Unsigned-Shift/Shift-and-Round
def BCDS_rec : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>;
def BCDUS_rec : VX_VT5_VA5_VB5_XO9_o <129, "bcdus.", []>;
def BCDSR_rec : VX_VT5_VA5_VB5_PS1_XO9_o<449, "bcdsr.", []>;
// Decimal (Unsigned) Truncate
def BCDTRUNC_rec : VX_VT5_VA5_VB5_PS1_XO9_o<257, "bcdtrunc." , []>;
def BCDUTRUNC_rec : VX_VT5_VA5_VB5_XO9_o <321, "bcdutrunc.", []>;
// Absolute Difference
def VABSDUB : VXForm_1<1027, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vabsdub $vD, $vA, $vB", IIC_VecGeneral,
[(set v16i8:$vD, (int_ppc_altivec_vabsdub v16i8:$vA, v16i8:$vB))]>;
def VABSDUH : VXForm_1<1091, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vabsduh $vD, $vA, $vB", IIC_VecGeneral,
[(set v8i16:$vD, (int_ppc_altivec_vabsduh v8i16:$vA, v8i16:$vB))]>;
def VABSDUW : VXForm_1<1155, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vabsduw $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (int_ppc_altivec_vabsduw v4i32:$vA, v4i32:$vB))]>;
} // end HasP9Altivec
Index: head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp (revision 362609)
@@ -1,4309 +1,4313 @@
//===-- PPCInstrInfo.cpp - PowerPC Instruction Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the PowerPC implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//
#include "PPCInstrInfo.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCHazardRecognizers.h"
#include "PPCInstrBuilder.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "ppc-instr-info"
#define GET_INSTRMAP_INFO
#define GET_INSTRINFO_CTOR_DTOR
#include "PPCGenInstrInfo.inc"
STATISTIC(NumStoreSPILLVSRRCAsVec,
"Number of spillvsrrc spilled to stack as vec");
STATISTIC(NumStoreSPILLVSRRCAsGpr,
"Number of spillvsrrc spilled to stack as gpr");
STATISTIC(NumGPRtoVSRSpill, "Number of gpr spills to spillvsrrc");
STATISTIC(CmpIselsConverted,
"Number of ISELs that depend on comparison of constants converted");
STATISTIC(MissedConvertibleImmediateInstrs,
"Number of compare-immediate instructions fed by constants");
STATISTIC(NumRcRotatesConvertedToRcAnd,
"Number of record-form rotates converted to record-form andi");
static cl::opt<bool>
DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
cl::desc("Disable analysis for CTR loops"));
static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt",
cl::desc("Disable compare instruction optimization"), cl::Hidden);
static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy",
cl::desc("Causes the backend to crash instead of generating a nop VSX copy"),
cl::Hidden);
static cl::opt<bool>
UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
cl::desc("Use the old (incorrect) instruction latency calculation"));
// Index into the OpcodesForSpill array.
enum SpillOpcodeKey {
SOK_Int4Spill,
SOK_Int8Spill,
SOK_Float8Spill,
SOK_Float4Spill,
SOK_CRSpill,
SOK_CRBitSpill,
SOK_VRVectorSpill,
SOK_VSXVectorSpill,
SOK_VectorFloat8Spill,
SOK_VectorFloat4Spill,
SOK_VRSaveSpill,
SOK_QuadFloat8Spill,
SOK_QuadFloat4Spill,
SOK_QuadBitSpill,
SOK_SpillToVSR,
SOK_SPESpill,
SOK_LastOpcodeSpill // This must be last on the enum.
};
// Pin the vtable to this file.
void PPCInstrInfo::anchor() {}
PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI)
: PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP,
/* CatchRetOpcode */ -1,
STI.isPPC64() ? PPC::BLR8 : PPC::BLR),
Subtarget(STI), RI(STI.getTargetMachine()) {}
/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
/// this target when scheduling the DAG.
ScheduleHazardRecognizer *
PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
const ScheduleDAG *DAG) const {
unsigned Directive =
static_cast<const PPCSubtarget *>(STI)->getCPUDirective();
if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
const InstrItineraryData *II =
static_cast<const PPCSubtarget *>(STI)->getInstrItineraryData();
return new ScoreboardHazardRecognizer(II, DAG);
}
return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG);
}
/// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer
/// to use for this target when scheduling the DAG.
ScheduleHazardRecognizer *
PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const {
unsigned Directive =
DAG->MF.getSubtarget<PPCSubtarget>().getCPUDirective();
// FIXME: Leaving this as-is until we have POWER9 scheduling info
if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8)
return new PPCDispatchGroupSBHazardRecognizer(II, DAG);
// Most subtargets use a PPC970 recognizer.
if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
assert(DAG->TII && "No InstrInfo?");
return new PPCHazardRecognizer970(*DAG);
}
return new ScoreboardHazardRecognizer(II, DAG);
}
unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr &MI,
unsigned *PredCost) const {
if (!ItinData || UseOldLatencyCalc)
return PPCGenInstrInfo::getInstrLatency(ItinData, MI, PredCost);
// The default implementation of getInstrLatency calls getStageLatency, but
// getStageLatency does not do the right thing for us. While we have
// itineraries, most cores are fully pipelined, so the itineraries only
// express the first part of the pipeline, not every stage. Instead, we need
// to use the listed output operand cycle number (using operand 0 here, which
// is an output).
unsigned Latency = 1;
unsigned DefClass = MI.getDesc().getSchedClass();
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
continue;
int Cycle = ItinData->getOperandCycle(DefClass, i);
if (Cycle < 0)
continue;
Latency = std::max(Latency, (unsigned) Cycle);
}
return Latency;
}
int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr &DefMI, unsigned DefIdx,
const MachineInstr &UseMI,
unsigned UseIdx) const {
int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
UseMI, UseIdx);
if (!DefMI.getParent())
return Latency;
const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
Register Reg = DefMO.getReg();
bool IsRegCR;
if (Register::isVirtualRegister(Reg)) {
const MachineRegisterInfo *MRI =
&DefMI.getParent()->getParent()->getRegInfo();
IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) ||
MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRBITRCRegClass);
} else {
IsRegCR = PPC::CRRCRegClass.contains(Reg) ||
PPC::CRBITRCRegClass.contains(Reg);
}
if (UseMI.isBranch() && IsRegCR) {
if (Latency < 0)
Latency = getInstrLatency(ItinData, DefMI);
// On some cores, there is an additional delay between writing to a condition
// register and using it from a branch.
unsigned Directive = Subtarget.getCPUDirective();
switch (Directive) {
default: break;
case PPC::DIR_7400:
case PPC::DIR_750:
case PPC::DIR_970:
case PPC::DIR_E5500:
case PPC::DIR_PWR4:
case PPC::DIR_PWR5:
case PPC::DIR_PWR5X:
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
// FIXME: Is this needed for POWER9?
Latency += 2;
break;
}
}
return Latency;
}
// This function does not list all associative and commutative operations, but
// only those worth feeding through the machine combiner in an attempt to
// reduce the critical path. Mostly, this means floating-point operations,
// because they have high latencies (compared to other operations, such as
// and/or, which are also associative and commutative, but have low latencies).
bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
switch (Inst.getOpcode()) {
// FP Add:
case PPC::FADD:
case PPC::FADDS:
// FP Multiply:
case PPC::FMUL:
case PPC::FMULS:
// Altivec Add:
case PPC::VADDFP:
// VSX Add:
case PPC::XSADDDP:
case PPC::XVADDDP:
case PPC::XVADDSP:
case PPC::XSADDSP:
// VSX Multiply:
case PPC::XSMULDP:
case PPC::XVMULDP:
case PPC::XVMULSP:
case PPC::XSMULSP:
// QPX Add:
case PPC::QVFADD:
case PPC::QVFADDS:
case PPC::QVFADDSs:
// QPX Multiply:
case PPC::QVFMUL:
case PPC::QVFMULS:
case PPC::QVFMULSs:
return true;
default:
return false;
}
}
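// Illustrative effect of the whitelist above (editorial): for a serial chain
// ((a + b) + c) + d, the machine combiner can reassociate to
// (a + b) + (c + d), letting two FP adds issue in parallel; this only pays
// off because FP add/mul latencies dominate, which is why low-latency
// logical ops are excluded.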
bool PPCInstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
// Using the machine combiner in this way is potentially expensive, so
// restrict to when aggressive optimizations are desired.
if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive)
return false;
// FP reassociation is only legal when we don't need strict IEEE semantics.
if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
return false;
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
}
// Detect 32 -> 64-bit extensions where we may reuse the low sub-register.
bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const {
switch (MI.getOpcode()) {
default: return false;
case PPC::EXTSW:
case PPC::EXTSW_32:
case PPC::EXTSW_32_64:
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
SubIdx = PPC::sub_32;
return true;
}
}
unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
unsigned Opcode = MI.getOpcode();
const unsigned *OpcodesForSpill = getLoadOpcodesForSpillArray();
const unsigned *End = OpcodesForSpill + SOK_LastOpcodeSpill;
if (End != std::find(OpcodesForSpill, End, Opcode)) {
// Check for the operands added by addFrameReference (the immediate is the
// offset which defaults to 0).
if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
MI.getOperand(2).isFI()) {
FrameIndex = MI.getOperand(2).getIndex();
return MI.getOperand(0).getReg();
}
}
return 0;
}
// For opcodes with the ReMaterializable flag set, this function is called to
// verify the instruction is really rematerializable.
bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
AliasAnalysis *AA) const {
switch (MI.getOpcode()) {
default:
// This function should only be called for opcodes with the ReMaterializable
// flag set.
llvm_unreachable("Unknown rematerializable operation!");
break;
case PPC::LI:
case PPC::LI8:
case PPC::LIS:
case PPC::LIS8:
case PPC::QVGPCI:
case PPC::ADDIStocHA:
case PPC::ADDIStocHA8:
case PPC::ADDItocL:
case PPC::LOAD_STACK_GUARD:
case PPC::XXLXORz:
case PPC::XXLXORspz:
case PPC::XXLXORdpz:
case PPC::XXLEQVOnes:
case PPC::V_SET0B:
case PPC::V_SET0H:
case PPC::V_SET0:
case PPC::V_SETALLONESB:
case PPC::V_SETALLONESH:
case PPC::V_SETALLONES:
case PPC::CRSET:
case PPC::CRUNSET:
return true;
}
return false;
}
unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
unsigned Opcode = MI.getOpcode();
const unsigned *OpcodesForSpill = getStoreOpcodesForSpillArray();
const unsigned *End = OpcodesForSpill + SOK_LastOpcodeSpill;
if (End != std::find(OpcodesForSpill, End, Opcode)) {
if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
MI.getOperand(2).isFI()) {
FrameIndex = MI.getOperand(2).getIndex();
return MI.getOperand(0).getReg();
}
}
return 0;
}
MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
MachineFunction &MF = *MI.getParent()->getParent();
// Normal instructions can be commuted the obvious way.
if (MI.getOpcode() != PPC::RLWIMI && MI.getOpcode() != PPC::RLWIMI_rec)
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
// Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
// 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
// changing the relative order of the mask operands might change what happens
// to the high-bits of the mask (and, thus, the result).
// Cannot commute if it has a non-zero rotate count.
if (MI.getOperand(3).getImm() != 0)
return nullptr;
// If we have a zero rotate count, we have:
// M = mask(MB,ME)
// Op0 = (Op1 & ~M) | (Op2 & M)
// Change this to:
// M = mask((ME+1)&31, (MB-1)&31)
// Op0 = (Op2 & ~M) | (Op1 & M)
// Swap op1/op2
assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) &&
"Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMI_rec.");
Register Reg0 = MI.getOperand(0).getReg();
Register Reg1 = MI.getOperand(1).getReg();
Register Reg2 = MI.getOperand(2).getReg();
unsigned SubReg1 = MI.getOperand(1).getSubReg();
unsigned SubReg2 = MI.getOperand(2).getSubReg();
bool Reg1IsKill = MI.getOperand(1).isKill();
bool Reg2IsKill = MI.getOperand(2).isKill();
bool ChangeReg0 = false;
// If machine instrs are no longer in two-address forms, update
// destination register as well.
if (Reg0 == Reg1) {
// Must be two address instruction!
assert(MI.getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
"Expecting a two-address instruction!");
assert(MI.getOperand(0).getSubReg() == SubReg1 && "Tied subreg mismatch");
Reg2IsKill = false;
ChangeReg0 = true;
}
// Masks.
unsigned MB = MI.getOperand(4).getImm();
unsigned ME = MI.getOperand(5).getImm();
// We can't commute a trivial mask (there is no way to represent an all-zero
// mask).
if (MB == 0 && ME == 31)
return nullptr;
if (NewMI) {
// Create a new instruction.
Register Reg0 = ChangeReg0 ? Reg2 : MI.getOperand(0).getReg();
bool Reg0IsDead = MI.getOperand(0).isDead();
return BuildMI(MF, MI.getDebugLoc(), MI.getDesc())
.addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
.addReg(Reg2, getKillRegState(Reg2IsKill))
.addReg(Reg1, getKillRegState(Reg1IsKill))
.addImm((ME + 1) & 31)
.addImm((MB - 1) & 31);
}
if (ChangeReg0) {
MI.getOperand(0).setReg(Reg2);
MI.getOperand(0).setSubReg(SubReg2);
}
MI.getOperand(2).setReg(Reg1);
MI.getOperand(1).setReg(Reg2);
MI.getOperand(2).setSubReg(SubReg1);
MI.getOperand(1).setSubReg(SubReg2);
MI.getOperand(2).setIsKill(Reg1IsKill);
MI.getOperand(1).setIsKill(Reg2IsKill);
// Swap the mask around.
MI.getOperand(4).setImm((ME + 1) & 31);
MI.getOperand(5).setImm((MB - 1) & 31);
return &MI;
}
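// Worked example of the mask transformation above (editorial, not part of
// the original source): with a zero rotate count, mask(MB=8, ME=23) selects
// bits 8..23, and the commuted form uses mask((23+1)&31, (8-1)&31) ==
// mask(24, 7), which wraps around to select exactly the complementary bits;
// swapping Op1/Op2 then leaves Op0 = (Op1 & ~M) | (Op2 & M) unchanged.
static_assert(((23 + 1) & 31) == 24 && ((8 - 1) & 31) == 7,
              "commuted RLWIMI mask bounds for MB=8, ME=23");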
bool PPCInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
// For VSX A-Type FMA instructions, it is the first two operands that can be
// commuted, however, because the non-encoded tied input operand is listed
// first, the operands to swap are actually the second and third.
int AltOpc = PPC::getAltVSXFMAOpcode(MI.getOpcode());
if (AltOpc == -1)
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
// The commutable operand indices are 2 and 3. Return them in SrcOpIdx1
// and SrcOpIdx2.
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
}
void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
// This function is used for scheduling, and the nop wanted here is the type
// that terminates dispatch groups on the POWER cores.
unsigned Directive = Subtarget.getCPUDirective();
unsigned Opcode;
switch (Directive) {
default: Opcode = PPC::NOP; break;
case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break;
case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break;
// FIXME: Update when the P8 instruction-scheduling model is ready.
case PPC::DIR_PWR8: Opcode = PPC::NOP_GT_PWR7; break;
// FIXME: Update when POWER9 scheduling model is ready.
case PPC::DIR_PWR9: Opcode = PPC::NOP_GT_PWR7; break;
}
DebugLoc DL;
BuildMI(MBB, MI, DL, get(Opcode));
}
/// Return the noop instruction to use.
void PPCInstrInfo::getNoop(MCInst &NopInst) const {
NopInst.setOpcode(PPC::NOP);
}
// Branch analysis.
// Note: If the condition register is set to CTR or CTR8 then this is a
// BDNZ (imm == 1) or BDZ (imm == 0) branch.
bool PPCInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
bool isPPC64 = Subtarget.isPPC64();
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return false;
if (!isUnpredicatedTerminator(*I))
return false;
if (AllowModify) {
// If the BB ends with an unconditional branch to the fallthrough BB,
// we eliminate the branch instruction.
if (I->getOpcode() == PPC::B &&
MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
I->eraseFromParent();
// We update the iterator after deleting the last branch.
I = MBB.getLastNonDebugInstr();
if (I == MBB.end() || !isUnpredicatedTerminator(*I))
return false;
}
}
// Get the last instruction in the block.
MachineInstr &LastInst = *I;
// If there is only one terminator instruction, process it.
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
if (LastInst.getOpcode() == PPC::B) {
if (!LastInst.getOperand(0).isMBB())
return true;
TBB = LastInst.getOperand(0).getMBB();
return false;
} else if (LastInst.getOpcode() == PPC::BCC) {
if (!LastInst.getOperand(2).isMBB())
return true;
// Block ends with fall-through condbranch.
TBB = LastInst.getOperand(2).getMBB();
Cond.push_back(LastInst.getOperand(0));
Cond.push_back(LastInst.getOperand(1));
return false;
} else if (LastInst.getOpcode() == PPC::BC) {
if (!LastInst.getOperand(1).isMBB())
return true;
// Block ends with fall-through condbranch.
TBB = LastInst.getOperand(1).getMBB();
Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
Cond.push_back(LastInst.getOperand(0));
return false;
} else if (LastInst.getOpcode() == PPC::BCn) {
if (!LastInst.getOperand(1).isMBB())
return true;
// Block ends with fall-through condbranch.
TBB = LastInst.getOperand(1).getMBB();
Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
Cond.push_back(LastInst.getOperand(0));
return false;
} else if (LastInst.getOpcode() == PPC::BDNZ8 ||
LastInst.getOpcode() == PPC::BDNZ) {
if (!LastInst.getOperand(0).isMBB())
return true;
if (DisableCTRLoopAnal)
return true;
TBB = LastInst.getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(1));
Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
true));
return false;
} else if (LastInst.getOpcode() == PPC::BDZ8 ||
LastInst.getOpcode() == PPC::BDZ) {
if (!LastInst.getOperand(0).isMBB())
return true;
if (DisableCTRLoopAnal)
return true;
TBB = LastInst.getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(0));
Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
true));
return false;
}
// Otherwise, don't know what this is.
return true;
}
// Get the instruction before it if it's a terminator.
MachineInstr &SecondLastInst = *I;
// If there are three terminators, we don't know what sort of block this is.
if (I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
// If the block ends with PPC::B and PPC::BCC, handle it.
if (SecondLastInst.getOpcode() == PPC::BCC &&
LastInst.getOpcode() == PPC::B) {
if (!SecondLastInst.getOperand(2).isMBB() ||
!LastInst.getOperand(0).isMBB())
return true;
TBB = SecondLastInst.getOperand(2).getMBB();
Cond.push_back(SecondLastInst.getOperand(0));
Cond.push_back(SecondLastInst.getOperand(1));
FBB = LastInst.getOperand(0).getMBB();
return false;
} else if (SecondLastInst.getOpcode() == PPC::BC &&
LastInst.getOpcode() == PPC::B) {
if (!SecondLastInst.getOperand(1).isMBB() ||
!LastInst.getOperand(0).isMBB())
return true;
TBB = SecondLastInst.getOperand(1).getMBB();
Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
Cond.push_back(SecondLastInst.getOperand(0));
FBB = LastInst.getOperand(0).getMBB();
return false;
} else if (SecondLastInst.getOpcode() == PPC::BCn &&
LastInst.getOpcode() == PPC::B) {
if (!SecondLastInst.getOperand(1).isMBB() ||
!LastInst.getOperand(0).isMBB())
return true;
TBB = SecondLastInst.getOperand(1).getMBB();
Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
Cond.push_back(SecondLastInst.getOperand(0));
FBB = LastInst.getOperand(0).getMBB();
return false;
} else if ((SecondLastInst.getOpcode() == PPC::BDNZ8 ||
SecondLastInst.getOpcode() == PPC::BDNZ) &&
LastInst.getOpcode() == PPC::B) {
if (!SecondLastInst.getOperand(0).isMBB() ||
!LastInst.getOperand(0).isMBB())
return true;
if (DisableCTRLoopAnal)
return true;
TBB = SecondLastInst.getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(1));
Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
true));
FBB = LastInst.getOperand(0).getMBB();
return false;
} else if ((SecondLastInst.getOpcode() == PPC::BDZ8 ||
SecondLastInst.getOpcode() == PPC::BDZ) &&
LastInst.getOpcode() == PPC::B) {
if (!SecondLastInst.getOperand(0).isMBB() ||
!LastInst.getOperand(0).isMBB())
return true;
if (DisableCTRLoopAnal)
return true;
TBB = SecondLastInst.getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(0));
Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
true));
FBB = LastInst.getOperand(0).getMBB();
return false;
}
// If the block ends with two PPC::Bs, handle it. The second one is not
// executed, so remove it.
if (SecondLastInst.getOpcode() == PPC::B && LastInst.getOpcode() == PPC::B) {
if (!SecondLastInst.getOperand(0).isMBB())
return true;
TBB = SecondLastInst.getOperand(0).getMBB();
I = LastInst;
if (AllowModify)
I->eraseFromParent();
return false;
}
// Otherwise, can't handle this.
return true;
}
unsigned PPCInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
assert(!BytesRemoved && "code size not handled");
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return 0;
if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC &&
I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ)
return 0;
// Remove the branch.
I->eraseFromParent();
I = MBB.end();
if (I == MBB.begin()) return 1;
--I;
if (I->getOpcode() != PPC::BCC &&
I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ)
return 1;
// Remove the branch.
I->eraseFromParent();
return 2;
}
unsigned PPCInstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded) const {
// Shouldn't be a fall through.
assert(TBB && "insertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 2 || Cond.size() == 0) &&
"PPC branch conditions have two components!");
assert(!BytesAdded && "code size not handled");
bool isPPC64 = Subtarget.isPPC64();
// One-way branch.
if (!FBB) {
if (Cond.empty()) // Unconditional branch
BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB);
else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
BuildMI(&MBB, DL, get(Cond[0].getImm() ?
(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
(isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
BuildMI(&MBB, DL, get(PPC::BC)).add(Cond[1]).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
BuildMI(&MBB, DL, get(PPC::BCn)).add(Cond[1]).addMBB(TBB);
else // Conditional branch
BuildMI(&MBB, DL, get(PPC::BCC))
.addImm(Cond[0].getImm())
.add(Cond[1])
.addMBB(TBB);
return 1;
}
// Two-way Conditional Branch.
if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
BuildMI(&MBB, DL, get(Cond[0].getImm() ?
(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
(isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
BuildMI(&MBB, DL, get(PPC::BC)).add(Cond[1]).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
BuildMI(&MBB, DL, get(PPC::BCn)).add(Cond[1]).addMBB(TBB);
else
BuildMI(&MBB, DL, get(PPC::BCC))
.addImm(Cond[0].getImm())
.add(Cond[1])
.addMBB(TBB);
BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB);
return 2;
}
// Select analysis.
bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg,
int &CondCycles, int &TrueCycles, int &FalseCycles) const {
if (Cond.size() != 2)
return false;
// If this is really a bdnz-like condition, then it cannot be turned into a
// select.
if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
return false;
// Check register classes.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC =
RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
if (!RC)
return false;
// isel is for regular integer GPRs only.
if (!PPC::GPRCRegClass.hasSubClassEq(RC) &&
!PPC::GPRC_NOR0RegClass.hasSubClassEq(RC) &&
!PPC::G8RCRegClass.hasSubClassEq(RC) &&
!PPC::G8RC_NOX0RegClass.hasSubClassEq(RC))
return false;
// FIXME: These numbers are for the A2; how well they work for other cores is
// an open question. On the A2, the isel instruction has a 2-cycle latency
// but single-cycle throughput. These numbers are used in combination with
// the MispredictPenalty setting from the active SchedMachineModel.
CondCycles = 1;
TrueCycles = 1;
FalseCycles = 1;
return true;
}
void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &dl, unsigned DestReg,
ArrayRef<MachineOperand> Cond, unsigned TrueReg,
unsigned FalseReg) const {
assert(Cond.size() == 2 &&
"PPC branch conditions have two components!");
// Get the register classes.
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC =
RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
assert(RC && "TrueReg and FalseReg must have overlapping register classes");
bool Is64Bit = PPC::G8RCRegClass.hasSubClassEq(RC) ||
PPC::G8RC_NOX0RegClass.hasSubClassEq(RC);
assert((Is64Bit ||
PPC::GPRCRegClass.hasSubClassEq(RC) ||
PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) &&
"isel is for regular integer GPRs only");
unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL;
auto SelectPred = static_cast<PPC::Predicate>(Cond[0].getImm());
unsigned SubIdx = 0;
bool SwapOps = false;
switch (SelectPred) {
case PPC::PRED_EQ:
case PPC::PRED_EQ_MINUS:
case PPC::PRED_EQ_PLUS:
SubIdx = PPC::sub_eq; SwapOps = false; break;
case PPC::PRED_NE:
case PPC::PRED_NE_MINUS:
case PPC::PRED_NE_PLUS:
SubIdx = PPC::sub_eq; SwapOps = true; break;
case PPC::PRED_LT:
case PPC::PRED_LT_MINUS:
case PPC::PRED_LT_PLUS:
SubIdx = PPC::sub_lt; SwapOps = false; break;
case PPC::PRED_GE:
case PPC::PRED_GE_MINUS:
case PPC::PRED_GE_PLUS:
SubIdx = PPC::sub_lt; SwapOps = true; break;
case PPC::PRED_GT:
case PPC::PRED_GT_MINUS:
case PPC::PRED_GT_PLUS:
SubIdx = PPC::sub_gt; SwapOps = false; break;
case PPC::PRED_LE:
case PPC::PRED_LE_MINUS:
case PPC::PRED_LE_PLUS:
SubIdx = PPC::sub_gt; SwapOps = true; break;
case PPC::PRED_UN:
case PPC::PRED_UN_MINUS:
case PPC::PRED_UN_PLUS:
SubIdx = PPC::sub_un; SwapOps = false; break;
case PPC::PRED_NU:
case PPC::PRED_NU_MINUS:
case PPC::PRED_NU_PLUS:
SubIdx = PPC::sub_un; SwapOps = true; break;
case PPC::PRED_BIT_SET: SubIdx = 0; SwapOps = false; break;
case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break;
}
unsigned FirstReg = SwapOps ? FalseReg : TrueReg,
SecondReg = SwapOps ? TrueReg : FalseReg;
// The first input register of isel cannot be r0. If it is a member
// of a register class that can be r0, then copy it first (the
// register allocator should eliminate the copy).
if (MRI.getRegClass(FirstReg)->contains(PPC::R0) ||
MRI.getRegClass(FirstReg)->contains(PPC::X0)) {
const TargetRegisterClass *FirstRC =
MRI.getRegClass(FirstReg)->contains(PPC::X0) ?
&PPC::G8RC_NOX0RegClass : &PPC::GPRC_NOR0RegClass;
unsigned OldFirstReg = FirstReg;
FirstReg = MRI.createVirtualRegister(FirstRC);
BuildMI(MBB, MI, dl, get(TargetOpcode::COPY), FirstReg)
.addReg(OldFirstReg);
}
BuildMI(MBB, MI, dl, get(OpCode), DestReg)
.addReg(FirstReg).addReg(SecondReg)
.addReg(Cond[1].getReg(), 0, SubIdx);
}
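// Why the SwapOps cases above work (editorial note): isel copies the first
// source when the tested CR bit is set, and predicates without a dedicated
// bit are derived from their complement -- e.g. GE tests the LT bit with
// TrueReg/FalseReg exchanged, since (x >= y) == !(x < y).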
static unsigned getCRBitValue(unsigned CRBit) {
unsigned Ret = 4;
if (CRBit == PPC::CR0LT || CRBit == PPC::CR1LT ||
CRBit == PPC::CR2LT || CRBit == PPC::CR3LT ||
CRBit == PPC::CR4LT || CRBit == PPC::CR5LT ||
CRBit == PPC::CR6LT || CRBit == PPC::CR7LT)
Ret = 3;
if (CRBit == PPC::CR0GT || CRBit == PPC::CR1GT ||
CRBit == PPC::CR2GT || CRBit == PPC::CR3GT ||
CRBit == PPC::CR4GT || CRBit == PPC::CR5GT ||
CRBit == PPC::CR6GT || CRBit == PPC::CR7GT)
Ret = 2;
if (CRBit == PPC::CR0EQ || CRBit == PPC::CR1EQ ||
CRBit == PPC::CR2EQ || CRBit == PPC::CR3EQ ||
CRBit == PPC::CR4EQ || CRBit == PPC::CR5EQ ||
CRBit == PPC::CR6EQ || CRBit == PPC::CR7EQ)
Ret = 1;
if (CRBit == PPC::CR0UN || CRBit == PPC::CR1UN ||
CRBit == PPC::CR2UN || CRBit == PPC::CR3UN ||
CRBit == PPC::CR4UN || CRBit == PPC::CR5UN ||
CRBit == PPC::CR6UN || CRBit == PPC::CR7UN)
Ret = 0;
assert(Ret != 4 && "Invalid CR bit register");
return Ret;
}
void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc) const {
// We can end up with self copies and similar things as a result of VSX copy
// legalization. Promote them here.
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (PPC::F8RCRegClass.contains(DestReg) &&
PPC::VSRCRegClass.contains(SrcReg)) {
MCRegister SuperReg =
TRI->getMatchingSuperReg(DestReg, PPC::sub_64, &PPC::VSRCRegClass);
if (VSXSelfCopyCrash && SrcReg == SuperReg)
llvm_unreachable("nop VSX copy");
DestReg = SuperReg;
} else if (PPC::F8RCRegClass.contains(SrcReg) &&
PPC::VSRCRegClass.contains(DestReg)) {
MCRegister SuperReg =
TRI->getMatchingSuperReg(SrcReg, PPC::sub_64, &PPC::VSRCRegClass);
if (VSXSelfCopyCrash && DestReg == SuperReg)
llvm_unreachable("nop VSX copy");
SrcReg = SuperReg;
}
// Different class register copy
if (PPC::CRBITRCRegClass.contains(SrcReg) &&
PPC::GPRCRegClass.contains(DestReg)) {
MCRegister CRReg = getCRFromCRBit(SrcReg);
BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(CRReg);
getKillRegState(KillSrc);
// Rotate the CR bit in the CR fields to be the least significant bit and
// then mask with 0x1 (MB = ME = 31).
BuildMI(MBB, I, DL, get(PPC::RLWINM), DestReg)
.addReg(DestReg, RegState::Kill)
.addImm(TRI->getEncodingValue(CRReg) * 4 + (4 - getCRBitValue(SrcReg)))
.addImm(31)
.addImm(31);
return;
} else if (PPC::CRRCRegClass.contains(SrcReg) &&
PPC::G8RCRegClass.contains(DestReg)) {
BuildMI(MBB, I, DL, get(PPC::MFOCRF8), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
return;
} else if (PPC::CRRCRegClass.contains(SrcReg) &&
PPC::GPRCRegClass.contains(DestReg)) {
BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
return;
} else if (PPC::G8RCRegClass.contains(SrcReg) &&
PPC::VSFRCRegClass.contains(DestReg)) {
assert(Subtarget.hasDirectMove() &&
"Subtarget doesn't support directmove, don't know how to copy.");
BuildMI(MBB, I, DL, get(PPC::MTVSRD), DestReg).addReg(SrcReg);
NumGPRtoVSRSpill++;
getKillRegState(KillSrc);
return;
} else if (PPC::VSFRCRegClass.contains(SrcReg) &&
PPC::G8RCRegClass.contains(DestReg)) {
assert(Subtarget.hasDirectMove() &&
"Subtarget doesn't support directmove, don't know how to copy.");
BuildMI(MBB, I, DL, get(PPC::MFVSRD), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
return;
} else if (PPC::SPERCRegClass.contains(SrcReg) &&
PPC::GPRCRegClass.contains(DestReg)) {
BuildMI(MBB, I, DL, get(PPC::EFSCFD), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
return;
} else if (PPC::GPRCRegClass.contains(SrcReg) &&
PPC::SPERCRegClass.contains(DestReg)) {
BuildMI(MBB, I, DL, get(PPC::EFDCFS), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
return;
}
unsigned Opc;
if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::OR;
else if (PPC::G8RCRegClass.contains(DestReg, SrcReg))
Opc = PPC::OR8;
else if (PPC::F4RCRegClass.contains(DestReg, SrcReg))
Opc = PPC::FMR;
else if (PPC::CRRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::MCRF;
else if (PPC::VRRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::VOR;
else if (PPC::VSRCRegClass.contains(DestReg, SrcReg))
// There are two different ways this can be done:
// 1. xxlor : This has lower latency (on the P7), 2 cycles, but can only
// issue in VSU pipeline 0.
// 2. xmovdp/xmovsp: This has higher latency (on the P7), 6 cycles, but
// can go to either pipeline.
// We'll always use xxlor here, because in practically all cases where
// copies are generated, they are close enough to some use that the
// lower-latency form is preferable.
Opc = PPC::XXLOR;
else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) ||
PPC::VSSRCRegClass.contains(DestReg, SrcReg))
Opc = (Subtarget.hasP9Vector()) ? PPC::XSCPSGNDP : PPC::XXLORf;
else if (PPC::QFRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::QVFMR;
else if (PPC::QSRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::QVFMRs;
else if (PPC::QBRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::QVFMRb;
else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::CROR;
else if (PPC::SPERCRegClass.contains(DestReg, SrcReg))
Opc = PPC::EVOR;
else
llvm_unreachable("Impossible reg-to-reg copy");
const MCInstrDesc &MCID = get(Opc);
if (MCID.getNumOperands() == 3)
BuildMI(MBB, I, DL, MCID, DestReg)
.addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc));
else
BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
}
unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg,
const TargetRegisterClass *RC)
const {
const unsigned *OpcodesForSpill = getStoreOpcodesForSpillArray();
int OpcodeIndex = 0;
if (RC != nullptr) {
if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_Int4Spill;
} else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_Int8Spill;
} else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_Float8Spill;
} else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_Float4Spill;
} else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_SPESpill;
} else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_CRSpill;
} else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_CRBitSpill;
} else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VRVectorSpill;
} else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VSXVectorSpill;
} else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VectorFloat8Spill;
} else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VectorFloat4Spill;
} else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VRSaveSpill;
} else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_QuadFloat8Spill;
} else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_QuadFloat4Spill;
} else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_QuadBitSpill;
} else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_SpillToVSR;
} else {
llvm_unreachable("Unknown regclass!");
}
} else {
if (PPC::GPRCRegClass.contains(Reg) ||
PPC::GPRC_NOR0RegClass.contains(Reg)) {
OpcodeIndex = SOK_Int4Spill;
} else if (PPC::G8RCRegClass.contains(Reg) ||
PPC::G8RC_NOX0RegClass.contains(Reg)) {
OpcodeIndex = SOK_Int8Spill;
} else if (PPC::F8RCRegClass.contains(Reg)) {
OpcodeIndex = SOK_Float8Spill;
} else if (PPC::F4RCRegClass.contains(Reg)) {
OpcodeIndex = SOK_Float4Spill;
} else if (PPC::SPERCRegClass.contains(Reg)) {
OpcodeIndex = SOK_SPESpill;
} else if (PPC::CRRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_CRSpill;
} else if (PPC::CRBITRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_CRBitSpill;
} else if (PPC::VRRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_VRVectorSpill;
} else if (PPC::VSRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_VSXVectorSpill;
} else if (PPC::VSFRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_VectorFloat8Spill;
} else if (PPC::VSSRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_VectorFloat4Spill;
} else if (PPC::VRSAVERCRegClass.contains(Reg)) {
OpcodeIndex = SOK_VRSaveSpill;
} else if (PPC::QFRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_QuadFloat8Spill;
} else if (PPC::QSRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_QuadFloat4Spill;
} else if (PPC::QBRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_QuadBitSpill;
} else if (PPC::SPILLTOVSRRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_SpillToVSR;
} else {
llvm_unreachable("Unknown regclass!");
}
}
return OpcodesForSpill[OpcodeIndex];
}
unsigned
PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg,
const TargetRegisterClass *RC) const {
const unsigned *OpcodesForSpill = getLoadOpcodesForSpillArray();
int OpcodeIndex = 0;
if (RC != nullptr) {
if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_Int4Spill;
} else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_Int8Spill;
} else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_Float8Spill;
} else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_Float4Spill;
} else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_SPESpill;
} else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_CRSpill;
} else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_CRBitSpill;
} else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VRVectorSpill;
} else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VSXVectorSpill;
} else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VectorFloat8Spill;
} else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VectorFloat4Spill;
} else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VRSaveSpill;
} else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_QuadFloat8Spill;
} else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_QuadFloat4Spill;
} else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_QuadBitSpill;
} else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_SpillToVSR;
} else {
llvm_unreachable("Unknown regclass!");
}
} else {
if (PPC::GPRCRegClass.contains(Reg) ||
PPC::GPRC_NOR0RegClass.contains(Reg)) {
OpcodeIndex = SOK_Int4Spill;
} else if (PPC::G8RCRegClass.contains(Reg) ||
PPC::G8RC_NOX0RegClass.contains(Reg)) {
OpcodeIndex = SOK_Int8Spill;
} else if (PPC::F8RCRegClass.contains(Reg)) {
OpcodeIndex = SOK_Float8Spill;
} else if (PPC::F4RCRegClass.contains(Reg)) {
OpcodeIndex = SOK_Float4Spill;
} else if (PPC::SPERCRegClass.contains(Reg)) {
OpcodeIndex = SOK_SPESpill;
} else if (PPC::CRRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_CRSpill;
} else if (PPC::CRBITRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_CRBitSpill;
} else if (PPC::VRRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_VRVectorSpill;
} else if (PPC::VSRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_VSXVectorSpill;
} else if (PPC::VSFRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_VectorFloat8Spill;
} else if (PPC::VSSRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_VectorFloat4Spill;
} else if (PPC::VRSAVERCRegClass.contains(Reg)) {
OpcodeIndex = SOK_VRSaveSpill;
} else if (PPC::QFRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_QuadFloat8Spill;
} else if (PPC::QSRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_QuadFloat4Spill;
} else if (PPC::QBRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_QuadBitSpill;
} else if (PPC::SPILLTOVSRRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_SpillToVSR;
} else {
llvm_unreachable("Unknown regclass!");
}
}
return OpcodesForSpill[OpcodeIndex];
}
void PPCInstrInfo::StoreRegToStackSlot(
MachineFunction &MF, unsigned SrcReg, bool isKill, int FrameIdx,
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr *> &NewMIs) const {
unsigned Opcode = getStoreOpcodeForSpill(PPC::NoRegister, RC);
DebugLoc DL;
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setHasSpills();
NewMIs.push_back(addFrameReference(
BuildMI(MF, DL, get(Opcode)).addReg(SrcReg, getKillRegState(isKill)),
FrameIdx));
if (PPC::CRRCRegClass.hasSubClassEq(RC) ||
PPC::CRBITRCRegClass.hasSubClassEq(RC))
FuncInfo->setSpillsCR();
if (PPC::VRSAVERCRegClass.hasSubClassEq(RC))
FuncInfo->setSpillsVRSAVE();
if (isXFormMemOp(Opcode))
FuncInfo->setHasNonRISpills();
}
void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill,
int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
SmallVector<MachineInstr *, 4> NewMIs;
// We need to avoid a situation in which the value from a VRRC register is
// spilled using an Altivec instruction and reloaded into a VSRC register
// using a VSX instruction. The issue with this is that the VSX
// load/store instructions swap the doublewords in the vector and the Altivec
// ones don't. The register classes on the spill/reload may be different if
// the register is defined using an Altivec instruction and is then used by a
// VSX instruction.
RC = updatedRC(RC);
StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs);
for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
MBB.insert(MI, NewMIs[i]);
const MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdx),
MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
MFI.getObjectAlignment(FrameIdx));
NewMIs.back()->addMemOperand(MF, MMO);
}
void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr *> &NewMIs)
const {
unsigned Opcode = getLoadOpcodeForSpill(PPC::NoRegister, RC);
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opcode), DestReg),
FrameIdx));
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
if (PPC::CRRCRegClass.hasSubClassEq(RC) ||
PPC::CRBITRCRegClass.hasSubClassEq(RC))
FuncInfo->setSpillsCR();
if (PPC::VRSAVERCRegClass.hasSubClassEq(RC))
FuncInfo->setSpillsVRSAVE();
if (isXFormMemOp(Opcode))
FuncInfo->setHasNonRISpills();
}
void
PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
SmallVector<MachineInstr*, 4> NewMIs;
DebugLoc DL;
if (MI != MBB.end()) DL = MI->getDebugLoc();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setHasSpills();
// We need to avoid a situation in which the value from a VRRC register is
// spilled using an Altivec instruction and reloaded into a VSRC register
// using a VSX instruction. The issue with this is that the VSX
// load/store instructions swap the doublewords in the vector and the Altivec
// ones don't. The register classes on the spill/reload may be different if
// the register is defined using an Altivec instruction and is then used by a
// VSX instruction.
if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
RC = &PPC::VSRCRegClass;
LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs);
for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
MBB.insert(MI, NewMIs[i]);
const MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdx),
MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
MFI.getObjectAlignment(FrameIdx));
NewMIs.back()->addMemOperand(MF, MMO);
}
bool PPCInstrInfo::
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
if (Cond[1].getReg() == PPC::CTR8 || Cond[1].getReg() == PPC::CTR)
Cond[0].setImm(Cond[0].getImm() == 0 ? 1 : 0);
else
// Leave the CR# the same, but invert the condition.
Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm()));
return false;
}
bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Reg, MachineRegisterInfo *MRI) const {
// For some instructions, it is legal to fold ZERO into the RA register field.
// A zero immediate should always be loaded with a single li.
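// A typical case is the RA operand of a D-form memory access such as LWZ,
// where encoding r0 yields a literal zero rather than the register's value.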
unsigned DefOpc = DefMI.getOpcode();
if (DefOpc != PPC::LI && DefOpc != PPC::LI8)
return false;
if (!DefMI.getOperand(1).isImm())
return false;
if (DefMI.getOperand(1).getImm() != 0)
return false;
// Note that we cannot invert the arguments of an isel here in order to fold
// a ZERO into what is presented as the second argument. All we have here
// is the condition bit, and that might come from a CR-logical bit operation.
const MCInstrDesc &UseMCID = UseMI.getDesc();
// Only fold into real machine instructions.
if (UseMCID.isPseudo())
return false;
unsigned UseIdx;
for (UseIdx = 0; UseIdx < UseMI.getNumOperands(); ++UseIdx)
if (UseMI.getOperand(UseIdx).isReg() &&
UseMI.getOperand(UseIdx).getReg() == Reg)
break;
assert(UseIdx < UseMI.getNumOperands() && "Cannot find Reg in UseMI");
assert(UseIdx < UseMCID.getNumOperands() && "No operand description for Reg");
const MCOperandInfo *UseInfo = &UseMCID.OpInfo[UseIdx];
// We can fold the zero if this register requires a GPRC_NOR0/G8RC_NOX0
// register (which might also be specified as a pointer class kind).
if (UseInfo->isLookupPtrRegClass()) {
if (UseInfo->RegClass /* Kind */ != 1)
return false;
} else {
if (UseInfo->RegClass != PPC::GPRC_NOR0RegClassID &&
UseInfo->RegClass != PPC::G8RC_NOX0RegClassID)
return false;
}
// Make sure this is not tied to an output register (or otherwise
// constrained). This is true for ST?UX instructions, for example, whose RA
// operand is tied to the output register.
if (UseInfo->Constraints != 0)
return false;
unsigned ZeroReg;
if (UseInfo->isLookupPtrRegClass()) {
bool isPPC64 = Subtarget.isPPC64();
ZeroReg = isPPC64 ? PPC::ZERO8 : PPC::ZERO;
} else {
ZeroReg = UseInfo->RegClass == PPC::G8RC_NOX0RegClassID ?
PPC::ZERO8 : PPC::ZERO;
}
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
UseMI.getOperand(UseIdx).setReg(ZeroReg);
if (DeleteDef)
DefMI.eraseFromParent();
return true;
}
static bool MBBDefinesCTR(MachineBasicBlock &MBB) {
for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
I != IE; ++I)
if (I->definesRegister(PPC::CTR) || I->definesRegister(PPC::CTR8))
return true;
return false;
}
// We should make sure that, if we're going to predicate both sides of a
// condition (a diamond), both sides don't define the counter register. We
// can predicate counter-decrement-based branches, but while that predicates
// the branching, it does not predicate the counter decrement. If we tried to
// merge the triangle into one predicated block, we'd decrement the counter
// twice.
bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
unsigned NumT, unsigned ExtraT,
MachineBasicBlock &FMBB,
unsigned NumF, unsigned ExtraF,
BranchProbability Probability) const {
return !(MBBDefinesCTR(TMBB) && MBBDefinesCTR(FMBB));
}
bool PPCInstrInfo::isPredicated(const MachineInstr &MI) const {
// The predicated branches are identified by their type, not really by the
// explicit presence of a predicate. Furthermore, some of them can be
// predicated more than once. Because if conversion won't try to predicate
// any instruction which already claims to be predicated (by returning true
// here), always return false. In doing so, we let isPredicable() be the
// final word on whether or not the instruction can be (further) predicated.
return false;
}
bool PPCInstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
if (!MI.isTerminator())
return false;
// Conditional branch is a special case.
if (MI.isBranch() && !MI.isBarrier())
return true;
return !isPredicated(MI);
}
bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const {
unsigned OpC = MI.getOpcode();
if (OpC == PPC::BLR || OpC == PPC::BLR8) {
if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
bool isPPC64 = Subtarget.isPPC64();
MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR)
: (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR)));
} else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
MI.setDesc(get(PPC::BCLR));
MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
} else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
MI.setDesc(get(PPC::BCLRn));
MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
} else {
MI.setDesc(get(PPC::BCCLR));
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addImm(Pred[0].getImm())
.add(Pred[1]);
}
return true;
} else if (OpC == PPC::B) {
if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
bool isPPC64 = Subtarget.isPPC64();
MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
: (isPPC64 ? PPC::BDZ8 : PPC::BDZ)));
} else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
MI.RemoveOperand(0);
MI.setDesc(get(PPC::BC));
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.add(Pred[1])
.addMBB(MBB);
} else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
MI.RemoveOperand(0);
MI.setDesc(get(PPC::BCn));
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.add(Pred[1])
.addMBB(MBB);
} else {
MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
MI.RemoveOperand(0);
MI.setDesc(get(PPC::BCC));
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addImm(Pred[0].getImm())
.add(Pred[1])
.addMBB(MBB);
}
return true;
} else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 || OpC == PPC::BCTRL ||
OpC == PPC::BCTRL8) {
if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR)
llvm_unreachable("Cannot predicate bctr[l] on the ctr register");
bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8;
bool isPPC64 = Subtarget.isPPC64();
if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8)
: (setLR ? PPC::BCCTRL : PPC::BCCTR)));
MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
return true;
} else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n)
: (setLR ? PPC::BCCTRLn : PPC::BCCTRn)));
MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
return true;
}
MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8)
: (setLR ? PPC::BCCCTRL : PPC::BCCCTR)));
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addImm(Pred[0].getImm())
.add(Pred[1]);
return true;
}
return false;
}
bool PPCInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
ArrayRef<MachineOperand> Pred2) const {
assert(Pred1.size() == 2 && "Invalid PPC first predicate");
assert(Pred2.size() == 2 && "Invalid PPC second predicate");
if (Pred1[1].getReg() == PPC::CTR8 || Pred1[1].getReg() == PPC::CTR)
return false;
if (Pred2[1].getReg() == PPC::CTR8 || Pred2[1].getReg() == PPC::CTR)
return false;
// P1 can only subsume P2 if they test the same condition register.
if (Pred1[1].getReg() != Pred2[1].getReg())
return false;
PPC::Predicate P1 = (PPC::Predicate) Pred1[0].getImm();
PPC::Predicate P2 = (PPC::Predicate) Pred2[0].getImm();
if (P1 == P2)
return true;
// Does P1 subsume P2? E.g., GE subsumes GT.
if (P1 == PPC::PRED_LE &&
(P2 == PPC::PRED_LT || P2 == PPC::PRED_EQ))
return true;
if (P1 == PPC::PRED_GE &&
(P2 == PPC::PRED_GT || P2 == PPC::PRED_EQ))
return true;
return false;
}
bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const {
// Note: At the present time, the contents of Pred from this function are
// unused by IfConversion. This implementation follows ARM by pushing the
// CR-defining operand. Because the 'DZ' and 'DNZ' count as types of
// predicate, instructions defining CTR or CTR8 are also included as
// predicate-defining instructions.
const TargetRegisterClass *RCs[] =
{ &PPC::CRRCRegClass, &PPC::CRBITRCRegClass,
&PPC::CTRRCRegClass, &PPC::CTRRC8RegClass };
bool Found = false;
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
for (unsigned c = 0; c < array_lengthof(RCs) && !Found; ++c) {
const TargetRegisterClass *RC = RCs[c];
if (MO.isReg()) {
if (MO.isDef() && RC->contains(MO.getReg())) {
Pred.push_back(MO);
Found = true;
}
} else if (MO.isRegMask()) {
for (TargetRegisterClass::iterator I = RC->begin(),
IE = RC->end(); I != IE; ++I)
if (MO.clobbersPhysReg(*I)) {
Pred.push_back(MO);
Found = true;
}
}
}
}
return Found;
}
bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &Mask,
int &Value) const {
unsigned Opc = MI.getOpcode();
switch (Opc) {
default: return false;
case PPC::CMPWI:
case PPC::CMPLWI:
case PPC::CMPDI:
case PPC::CMPLDI:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
Value = MI.getOperand(2).getImm();
Mask = 0xFFFF;
return true;
case PPC::CMPW:
case PPC::CMPLW:
case PPC::CMPD:
case PPC::CMPLD:
case PPC::FCMPUS:
case PPC::FCMPUD:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
Value = 0;
Mask = 0;
return true;
}
}
bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int Mask, int Value,
const MachineRegisterInfo *MRI) const {
if (DisableCmpOpt)
return false;
int OpC = CmpInstr.getOpcode();
Register CRReg = CmpInstr.getOperand(0).getReg();
// FP record forms set CR1 based on the exception status bits, not a
// comparison with zero.
if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD)
return false;
const TargetRegisterInfo *TRI = &getRegisterInfo();
// The record forms set the condition register based on a signed comparison
// with zero (so says the ISA manual). This is not as straightforward as it
// seems, however, because this is always a 64-bit comparison on PPC64, even
// for instructions that are 32-bit in nature (like slw for example).
// So, on PPC32, for unsigned comparisons, we can use the record forms only
// for equality checks (as those don't depend on the sign). On PPC64,
// we are restricted to equality for unsigned 64-bit comparisons and for
// signed 32-bit comparisons the applicability is more restricted.
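// For example, slw on PPC64 clears the high 32 bits of its result, so a
// 32-bit-negative result such as 0x80000000 is positive as a 64-bit value;
// the record form would then set GT where a true 32-bit signed compare with
// zero would set LT.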
bool isPPC64 = Subtarget.isPPC64();
bool is32BitSignedCompare = OpC == PPC::CMPWI || OpC == PPC::CMPW;
bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW;
bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD;
// Look through copies unless that gets us to a physical register.
unsigned ActualSrc = TRI->lookThruCopyLike(SrcReg, MRI);
if (Register::isVirtualRegister(ActualSrc))
SrcReg = ActualSrc;
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI) return false;
bool equalityOnly = false;
bool noSub = false;
if (isPPC64) {
if (is32BitSignedCompare) {
// We can perform this optimization only if MI is sign-extending.
if (isSignExtended(*MI))
noSub = true;
else
return false;
} else if (is32BitUnsignedCompare) {
// We can perform this optimization, equality only, if MI is
// zero-extending.
if (isZeroExtended(*MI)) {
noSub = true;
equalityOnly = true;
} else
return false;
} else
equalityOnly = is64BitUnsignedCompare;
} else
equalityOnly = is32BitUnsignedCompare;
if (equalityOnly) {
// We need to check the uses of the condition register in order to reject
// non-equality comparisons.
for (MachineRegisterInfo::use_instr_iterator
I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end();
I != IE; ++I) {
MachineInstr *UseMI = &*I;
if (UseMI->getOpcode() == PPC::BCC) {
PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
unsigned PredCond = PPC::getPredicateCondition(Pred);
// We ignore hint bits when checking for non-equality comparisons.
if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE)
return false;
} else if (UseMI->getOpcode() == PPC::ISEL ||
UseMI->getOpcode() == PPC::ISEL8) {
unsigned SubIdx = UseMI->getOperand(3).getSubReg();
if (SubIdx != PPC::sub_eq)
return false;
} else
return false;
}
}
MachineBasicBlock::iterator I = CmpInstr;
// Scan forward to find the first use of the compare.
for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end(); I != EL;
++I) {
bool FoundUse = false;
for (MachineRegisterInfo::use_instr_iterator
J = MRI->use_instr_begin(CRReg), JE = MRI->use_instr_end();
J != JE; ++J)
if (&*J == &*I) {
FoundUse = true;
break;
}
if (FoundUse)
break;
}
SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate;
SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate;
// There are two possible candidates which can be changed to set CR[01].
// One is MI, the other is a SUB instruction.
// For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
MachineInstr *Sub = nullptr;
if (SrcReg2 != 0)
// MI is not a candidate for CMPrr.
MI = nullptr;
// FIXME: Conservatively refuse to convert an instruction which isn't in the
// same BB as the comparison. This is to allow the check below to avoid calls
// (and other explicit clobbers); instead we should really check for these
// more explicitly (in at least a few predecessors).
else if (MI->getParent() != CmpInstr.getParent())
return false;
else if (Value != 0) {
// The record-form instructions set CR bit based on signed comparison
// against 0. We try to convert a compare against 1 or -1 into a compare
// against 0 to exploit record-form instructions. For example, we change
// the condition "greater than -1" into "greater than or equal to 0"
// and "less than 1" into "less than or equal to 0".
// Since we optimize comparison based on a specific branch condition,
// we don't optimize if the condition code is used more than once.
if (equalityOnly || !MRI->hasOneUse(CRReg))
return false;
MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg);
if (UseMI->getOpcode() != PPC::BCC)
return false;
PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
unsigned PredCond = PPC::getPredicateCondition(Pred);
unsigned PredHint = PPC::getPredicateHint(Pred);
int16_t Immed = (int16_t)Value;
// When modifying the condition in the predicate, we propagate hint bits
// from the original predicate to the new one.
if (Immed == -1 && PredCond == PPC::PRED_GT)
// We convert "greater than -1" into "greater than or equal to 0",
// since !equalityOnly implies a signed comparison here.
Pred = PPC::getPredicate(PPC::PRED_GE, PredHint);
else if (Immed == -1 && PredCond == PPC::PRED_LE)
// We convert "less than or equal to -1" into "less than 0".
Pred = PPC::getPredicate(PPC::PRED_LT, PredHint);
else if (Immed == 1 && PredCond == PPC::PRED_LT)
// We convert "less than 1" into "less than or equal to 0".
Pred = PPC::getPredicate(PPC::PRED_LE, PredHint);
else if (Immed == 1 && PredCond == PPC::PRED_GE)
// We convert "greater than or equal to 1" into "greater than 0".
Pred = PPC::getPredicate(PPC::PRED_GT, PredHint);
else
return false;
PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), Pred));
}
// Search for Sub.
--I;
// Get ready to iterate backward from CmpInstr.
MachineBasicBlock::iterator E = MI, B = CmpInstr.getParent()->begin();
for (; I != E && !noSub; --I) {
const MachineInstr &Instr = *I;
unsigned IOpC = Instr.getOpcode();
if (&*I != &CmpInstr && (Instr.modifiesRegister(PPC::CR0, TRI) ||
Instr.readsRegister(PPC::CR0, TRI)))
// This instruction modifies or uses the record condition register after
// the one we want to change. While we could do this transformation, it
// would likely not be profitable. This transformation removes one
// instruction, and so even forcing RA to generate one move probably
// makes it unprofitable.
return false;
// Check whether CmpInstr can be made redundant by the current instruction.
if ((OpC == PPC::CMPW || OpC == PPC::CMPLW ||
OpC == PPC::CMPD || OpC == PPC::CMPLD) &&
(IOpC == PPC::SUBF || IOpC == PPC::SUBF8) &&
((Instr.getOperand(1).getReg() == SrcReg &&
Instr.getOperand(2).getReg() == SrcReg2) ||
(Instr.getOperand(1).getReg() == SrcReg2 &&
Instr.getOperand(2).getReg() == SrcReg))) {
Sub = &*I;
break;
}
if (I == B)
// The 'and' is below the comparison instruction.
return false;
}
// Return false if no candidates exist.
if (!MI && !Sub)
return false;
// The single candidate is called MI.
if (!MI) MI = Sub;
int NewOpC = -1;
int MIOpC = MI->getOpcode();
if (MIOpC == PPC::ANDI_rec || MIOpC == PPC::ANDI8_rec ||
MIOpC == PPC::ANDIS_rec || MIOpC == PPC::ANDIS8_rec)
NewOpC = MIOpC;
else {
NewOpC = PPC::getRecordFormOpcode(MIOpC);
if (NewOpC == -1 && PPC::getNonRecordFormOpcode(MIOpC) != -1)
NewOpC = MIOpC;
}
// FIXME: On the non-embedded POWER architectures, only some of the record
// forms are fast, and we should use only the fast ones.
// The defining instruction has a record form (or is already a record
// form). It is possible, however, that we'll need to reverse the condition
// code of the users.
if (NewOpC == -1)
return false;
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP
// needs to be updated to be based on SUB. Push the condition code
// operands to OperandsToUpdate. If it is safe to remove CmpInstr, the
// condition code of these operands will be modified.
// Here, Value == 0 means we haven't converted comparison against 1 or -1 to
// comparison against 0, which may modify the predicate.
bool ShouldSwap = false;
if (Sub && Value == 0) {
ShouldSwap = SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
Sub->getOperand(2).getReg() == SrcReg;
// The operands to subf are the opposite of sub, so only in the fixed-point
// case, invert the order.
ShouldSwap = !ShouldSwap;
}
if (ShouldSwap)
for (MachineRegisterInfo::use_instr_iterator
I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end();
I != IE; ++I) {
MachineInstr *UseMI = &*I;
if (UseMI->getOpcode() == PPC::BCC) {
PPC::Predicate Pred = (PPC::Predicate) UseMI->getOperand(0).getImm();
unsigned PredCond = PPC::getPredicateCondition(Pred);
assert((!equalityOnly ||
PredCond == PPC::PRED_EQ || PredCond == PPC::PRED_NE) &&
"Invalid predicate for equality-only optimization");
(void)PredCond; // To suppress warning in release build.
PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
PPC::getSwappedPredicate(Pred)));
} else if (UseMI->getOpcode() == PPC::ISEL ||
UseMI->getOpcode() == PPC::ISEL8) {
unsigned NewSubReg = UseMI->getOperand(3).getSubReg();
assert((!equalityOnly || NewSubReg == PPC::sub_eq) &&
"Invalid CR bit for equality-only optimization");
if (NewSubReg == PPC::sub_lt)
NewSubReg = PPC::sub_gt;
else if (NewSubReg == PPC::sub_gt)
NewSubReg = PPC::sub_lt;
SubRegsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(3)),
NewSubReg));
} else // We need to abort on a user we don't understand.
return false;
}
assert(!(Value != 0 && ShouldSwap) &&
"Non-zero immediate support and ShouldSwap"
"may conflict in updating predicate");
// Create a new virtual register to hold the value of the CR set by the
// record-form instruction. If the instruction was not previously in
// record form, then set the kill flag on the CR.
CmpInstr.eraseFromParent();
MachineBasicBlock::iterator MII = MI;
BuildMI(*MI->getParent(), std::next(MII), MI->getDebugLoc(),
get(TargetOpcode::COPY), CRReg)
.addReg(PPC::CR0, MIOpC != NewOpC ? RegState::Kill : 0);
// Even if the CR0 register was dead before, it is alive now since the
// instruction we just built uses it.
MI->clearRegisterDeads(PPC::CR0);
if (MIOpC != NewOpC) {
// We need to be careful here: we're replacing one instruction with
// another, and we need to make sure that we get all of the right
// implicit uses and defs. On the other hand, the caller may be holding
// an iterator to this instruction, and so we can't delete it (this is
// specifically the case if this is the instruction directly after the
// compare).
// Rotates are expensive instructions. If we're emitting a record-form
// rotate that can just be an andi/andis, we should just emit that.
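// For example, a record-form RLWINM with SH = 0, MB = 16, ME = 31 tests the
// same bits as ANDI_rec with immediate 0xFFFF, and with MB = 0, ME = 15 it
// matches ANDIS_rec with immediate 0xFFFF.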
if (MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINM8) {
Register GPRRes = MI->getOperand(0).getReg();
int64_t SH = MI->getOperand(2).getImm();
int64_t MB = MI->getOperand(3).getImm();
int64_t ME = MI->getOperand(4).getImm();
// We can only do this if both the start and end of the mask are in the
// same halfword.
bool MBInLoHWord = MB >= 16;
bool MEInLoHWord = ME >= 16;
uint64_t Mask = ~0LLU;
if (MB <= ME && MBInLoHWord == MEInLoHWord && SH == 0) {
Mask = ((1LLU << (32 - MB)) - 1) & ~((1LLU << (31 - ME)) - 1);
// The mask value needs to be shifted right by 16 if we're emitting andis.
Mask >>= MBInLoHWord ? 0 : 16;
NewOpC = MIOpC == PPC::RLWINM
? (MBInLoHWord ? PPC::ANDI_rec : PPC::ANDIS_rec)
: (MBInLoHWord ? PPC::ANDI8_rec : PPC::ANDIS8_rec);
} else if (MRI->use_empty(GPRRes) && (ME == 31) &&
(ME - MB + 1 == SH) && (MB >= 16)) {
// If we are rotating by the exact number of bits as are in the mask
// and the mask is in the least significant bits of the register,
// that's just an andis (as long as the GPR result has no uses).
Mask = ((1LLU << 32) - 1) & ~((1LLU << (32 - SH)) - 1);
Mask >>= 16;
NewOpC = MIOpC == PPC::RLWINM ? PPC::ANDIS_rec : PPC::ANDIS8_rec;
}
// If we've set the mask, we can transform.
if (Mask != ~0LLU) {
MI->RemoveOperand(4);
MI->RemoveOperand(3);
MI->getOperand(2).setImm(Mask);
NumRcRotatesConvertedToRcAnd++;
}
} else if (MIOpC == PPC::RLDICL && MI->getOperand(2).getImm() == 0) {
int64_t MB = MI->getOperand(3).getImm();
if (MB >= 48) {
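// e.g. a record-form RLDICL with SH = 0 and MB = 48 keeps only the low 16
// bits, which is the same as ANDI8_rec with immediate 0xFFFF.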
uint64_t Mask = (1LLU << (63 - MB + 1)) - 1;
NewOpC = PPC::ANDI8_rec;
MI->RemoveOperand(3);
MI->getOperand(2).setImm(Mask);
NumRcRotatesConvertedToRcAnd++;
}
}
const MCInstrDesc &NewDesc = get(NewOpC);
MI->setDesc(NewDesc);
if (NewDesc.ImplicitDefs)
for (const MCPhysReg *ImpDefs = NewDesc.getImplicitDefs();
*ImpDefs; ++ImpDefs)
if (!MI->definesRegister(*ImpDefs))
MI->addOperand(*MI->getParent()->getParent(),
MachineOperand::CreateReg(*ImpDefs, true, true));
if (NewDesc.ImplicitUses)
for (const MCPhysReg *ImpUses = NewDesc.getImplicitUses();
*ImpUses; ++ImpUses)
if (!MI->readsRegister(*ImpUses))
MI->addOperand(*MI->getParent()->getParent(),
MachineOperand::CreateReg(*ImpUses, false, true));
}
assert(MI->definesRegister(PPC::CR0) &&
"Record-form instruction does not define cr0?");
// Modify the condition code of operands in OperandsToUpdate.
// Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
// be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
for (unsigned i = 0, e = PredsToUpdate.size(); i < e; i++)
PredsToUpdate[i].first->setImm(PredsToUpdate[i].second);
for (unsigned i = 0, e = SubRegsToUpdate.size(); i < e; i++)
SubRegsToUpdate[i].first->setSubReg(SubRegsToUpdate[i].second);
return true;
}
/// Return the number of bytes of code the specified instruction may be.
/// This returns the maximum number of bytes.
///
unsigned PPCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
if (Opcode == PPC::INLINEASM || Opcode == PPC::INLINEASM_BR) {
const MachineFunction *MF = MI.getParent()->getParent();
const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
} else if (Opcode == TargetOpcode::STACKMAP) {
StackMapOpers Opers(&MI);
return Opers.getNumPatchBytes();
} else if (Opcode == TargetOpcode::PATCHPOINT) {
PatchPointOpers Opers(&MI);
return Opers.getNumPatchBytes();
} else {
return get(Opcode).getSize();
}
}
std::pair<unsigned, unsigned>
PPCInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
const unsigned Mask = PPCII::MO_ACCESS_MASK;
return std::make_pair(TF & Mask, TF & ~Mask);
}
ArrayRef<std::pair<unsigned, const char *>>
PPCInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
using namespace PPCII;
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_LO, "ppc-lo"},
{MO_HA, "ppc-ha"},
{MO_TPREL_LO, "ppc-tprel-lo"},
{MO_TPREL_HA, "ppc-tprel-ha"},
{MO_DTPREL_LO, "ppc-dtprel-lo"},
{MO_TLSLD_LO, "ppc-tlsld-lo"},
{MO_TOC_LO, "ppc-toc-lo"},
{MO_TLS, "ppc-tls"}};
return makeArrayRef(TargetFlags);
}
ArrayRef<std::pair<unsigned, const char *>>
PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
using namespace PPCII;
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_PLT, "ppc-plt"},
{MO_PIC_FLAG, "ppc-pic"},
{MO_NLP_FLAG, "ppc-nlp"},
{MO_NLP_HIDDEN_FLAG, "ppc-nlp-hidden"}};
return makeArrayRef(TargetFlags);
}
// Expand VSX Memory Pseudo instruction to either a VSX or a FP instruction.
// The VSX versions have the advantage of a full 64-register target whereas
// the FP ones have the advantage of lower latency and higher throughput. So
// what we are after is using the faster instructions in low register pressure
// situations and using the larger register file in high register pressure
// situations.
bool PPCInstrInfo::expandVSXMemPseudo(MachineInstr &MI) const {
unsigned UpperOpcode, LowerOpcode;
switch (MI.getOpcode()) {
case PPC::DFLOADf32:
UpperOpcode = PPC::LXSSP;
LowerOpcode = PPC::LFS;
break;
case PPC::DFLOADf64:
UpperOpcode = PPC::LXSD;
LowerOpcode = PPC::LFD;
break;
case PPC::DFSTOREf32:
UpperOpcode = PPC::STXSSP;
LowerOpcode = PPC::STFS;
break;
case PPC::DFSTOREf64:
UpperOpcode = PPC::STXSD;
LowerOpcode = PPC::STFD;
break;
case PPC::XFLOADf32:
UpperOpcode = PPC::LXSSPX;
LowerOpcode = PPC::LFSX;
break;
case PPC::XFLOADf64:
UpperOpcode = PPC::LXSDX;
LowerOpcode = PPC::LFDX;
break;
case PPC::XFSTOREf32:
UpperOpcode = PPC::STXSSPX;
LowerOpcode = PPC::STFSX;
break;
case PPC::XFSTOREf64:
UpperOpcode = PPC::STXSDX;
LowerOpcode = PPC::STFDX;
break;
case PPC::LIWAX:
UpperOpcode = PPC::LXSIWAX;
LowerOpcode = PPC::LFIWAX;
break;
case PPC::LIWZX:
UpperOpcode = PPC::LXSIWZX;
LowerOpcode = PPC::LFIWZX;
break;
case PPC::STIWX:
UpperOpcode = PPC::STXSIWX;
LowerOpcode = PPC::STFIWX;
break;
default:
llvm_unreachable("Unknown Operation!");
}
Register TargetReg = MI.getOperand(0).getReg();
unsigned Opcode;
if ((TargetReg >= PPC::F0 && TargetReg <= PPC::F31) ||
(TargetReg >= PPC::VSL0 && TargetReg <= PPC::VSL31))
Opcode = LowerOpcode;
else
Opcode = UpperOpcode;
MI.setDesc(get(Opcode));
return true;
}
static bool isAnImmediateOperand(const MachineOperand &MO) {
return MO.isCPI() || MO.isGlobal() || MO.isImm();
}
bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
auto &MBB = *MI.getParent();
auto DL = MI.getDebugLoc();
switch (MI.getOpcode()) {
case TargetOpcode::LOAD_STACK_GUARD: {
assert(Subtarget.isTargetLinux() &&
"Only Linux target is expected to contain LOAD_STACK_GUARD");
const int64_t Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008;
const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2;
MI.setDesc(get(Subtarget.isPPC64() ? PPC::LD : PPC::LWZ));
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addImm(Offset)
.addReg(Reg);
return true;
}
case PPC::DFLOADf32:
case PPC::DFLOADf64:
case PPC::DFSTOREf32:
case PPC::DFSTOREf64: {
assert(Subtarget.hasP9Vector() &&
"Invalid D-Form Pseudo-ops on Pre-P9 target.");
assert(MI.getOperand(2).isReg() &&
isAnImmediateOperand(MI.getOperand(1)) &&
"D-form op must have register and immediate operands");
return expandVSXMemPseudo(MI);
}
case PPC::XFLOADf32:
case PPC::XFSTOREf32:
case PPC::LIWAX:
case PPC::LIWZX:
case PPC::STIWX: {
assert(Subtarget.hasP8Vector() &&
"Invalid X-Form Pseudo-ops on Pre-P8 target.");
assert(MI.getOperand(2).isReg() && MI.getOperand(1).isReg() &&
"X-form op must have register and register operands");
return expandVSXMemPseudo(MI);
}
case PPC::XFLOADf64:
case PPC::XFSTOREf64: {
assert(Subtarget.hasVSX() &&
"Invalid X-Form Pseudo-ops on target that has no VSX.");
assert(MI.getOperand(2).isReg() && MI.getOperand(1).isReg() &&
"X-form op must have register and register operands");
return expandVSXMemPseudo(MI);
}
case PPC::SPILLTOVSR_LD: {
Register TargetReg = MI.getOperand(0).getReg();
if (PPC::VSFRCRegClass.contains(TargetReg)) {
MI.setDesc(get(PPC::DFLOADf64));
return expandPostRAPseudo(MI);
}
else
MI.setDesc(get(PPC::LD));
return true;
}
case PPC::SPILLTOVSR_ST: {
Register SrcReg = MI.getOperand(0).getReg();
if (PPC::VSFRCRegClass.contains(SrcReg)) {
NumStoreSPILLVSRRCAsVec++;
MI.setDesc(get(PPC::DFSTOREf64));
return expandPostRAPseudo(MI);
} else {
NumStoreSPILLVSRRCAsGpr++;
MI.setDesc(get(PPC::STD));
}
return true;
}
case PPC::SPILLTOVSR_LDX: {
Register TargetReg = MI.getOperand(0).getReg();
if (PPC::VSFRCRegClass.contains(TargetReg))
MI.setDesc(get(PPC::LXSDX));
else
MI.setDesc(get(PPC::LDX));
return true;
}
case PPC::SPILLTOVSR_STX: {
Register SrcReg = MI.getOperand(0).getReg();
if (PPC::VSFRCRegClass.contains(SrcReg)) {
NumStoreSPILLVSRRCAsVec++;
MI.setDesc(get(PPC::STXSDX));
} else {
NumStoreSPILLVSRRCAsGpr++;
MI.setDesc(get(PPC::STDX));
}
return true;
}
case PPC::CFENCE8: {
auto Val = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(PPC::CMPD), PPC::CR7).addReg(Val).addReg(Val);
BuildMI(MBB, MI, DL, get(PPC::CTRL_DEP))
.addImm(PPC::PRED_NE_MINUS)
.addReg(PPC::CR7)
.addImm(1);
MI.setDesc(get(PPC::ISYNC));
MI.RemoveOperand(0);
return true;
}
}
return false;
}
// Essentially a compile-time implementation of a compare->isel sequence.
// It takes two constants to compare, along with the true/false registers
// and the comparison type (as a subreg to a CR field) and returns one
// of the true/false registers, depending on the comparison results.
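// For example, selectReg(3, 5, PPC::CMPWI, TrueReg, FalseReg, PPC::sub_lt)
// yields TrueReg, since 3 < 5 holds as a signed comparison.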
static unsigned selectReg(int64_t Imm1, int64_t Imm2, unsigned CompareOpc,
unsigned TrueReg, unsigned FalseReg,
unsigned CRSubReg) {
// Signed comparisons. The immediates are assumed to be sign-extended.
if (CompareOpc == PPC::CMPWI || CompareOpc == PPC::CMPDI) {
switch (CRSubReg) {
default: llvm_unreachable("Unknown integer comparison type.");
case PPC::sub_lt:
return Imm1 < Imm2 ? TrueReg : FalseReg;
case PPC::sub_gt:
return Imm1 > Imm2 ? TrueReg : FalseReg;
case PPC::sub_eq:
return Imm1 == Imm2 ? TrueReg : FalseReg;
}
}
// Unsigned comparisons.
else if (CompareOpc == PPC::CMPLWI || CompareOpc == PPC::CMPLDI) {
switch (CRSubReg) {
default: llvm_unreachable("Unknown integer comparison type.");
case PPC::sub_lt:
return (uint64_t)Imm1 < (uint64_t)Imm2 ? TrueReg : FalseReg;
case PPC::sub_gt:
return (uint64_t)Imm1 > (uint64_t)Imm2 ? TrueReg : FalseReg;
case PPC::sub_eq:
return Imm1 == Imm2 ? TrueReg : FalseReg;
}
}
return PPC::NoRegister;
}
void PPCInstrInfo::replaceInstrOperandWithImm(MachineInstr &MI,
unsigned OpNo,
int64_t Imm) const {
assert(MI.getOperand(OpNo).isReg() && "Operand must be a REG");
// Replace the REG with the Immediate.
Register InUseReg = MI.getOperand(OpNo).getReg();
MI.getOperand(OpNo).ChangeToImmediate(Imm);
if (MI.implicit_operands().empty())
return;
// We need to make sure that the MI no longer has any implicit use
// of this REG.
const TargetRegisterInfo *TRI = &getRegisterInfo();
int UseOpIdx = MI.findRegisterUseOperandIdx(InUseReg, false, TRI);
if (UseOpIdx >= 0) {
MachineOperand &MO = MI.getOperand(UseOpIdx);
if (MO.isImplicit())
// The operands must always be in the following order:
// - explicit reg defs,
// - other explicit operands (reg uses, immediates, etc.),
// - implicit reg defs
// - implicit reg uses
// Therefore, removing the implicit operand won't change the explicit
// operands layout.
MI.RemoveOperand(UseOpIdx);
}
}
// Replace an instruction with one that materializes a constant (and sets
// CR0 if the original instruction was a record-form instruction).
void PPCInstrInfo::replaceInstrWithLI(MachineInstr &MI,
const LoadImmediateInfo &LII) const {
// Remove existing operands.
int OperandToKeep = LII.SetCR ? 1 : 0;
for (int i = MI.getNumOperands() - 1; i > OperandToKeep; i--)
MI.RemoveOperand(i);
// Replace the instruction.
if (LII.SetCR) {
MI.setDesc(get(LII.Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec));
// Set the immediate.
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addImm(LII.Imm).addReg(PPC::CR0, RegState::ImplicitDefine);
return;
}
else
MI.setDesc(get(LII.Is64Bit ? PPC::LI8 : PPC::LI));
// Set the immediate.
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addImm(LII.Imm);
}
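// Scan backwards from MI within its basic block for the closest instruction
// that defines Reg, reporting through SeenIntermediateUse whether any read
// of Reg was seen along the way.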
MachineInstr *PPCInstrInfo::getDefMIPostRA(unsigned Reg, MachineInstr &MI,
bool &SeenIntermediateUse) const {
assert(!MI.getParent()->getParent()->getRegInfo().isSSA() &&
"Should be called after register allocation.");
const TargetRegisterInfo *TRI = &getRegisterInfo();
MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI;
It++;
SeenIntermediateUse = false;
for (; It != E; ++It) {
if (It->modifiesRegister(Reg, TRI))
return &*It;
if (It->readsRegister(Reg, TRI))
SeenIntermediateUse = true;
}
return nullptr;
}
MachineInstr *PPCInstrInfo::getForwardingDefMI(
MachineInstr &MI,
unsigned &OpNoForForwarding,
bool &SeenIntermediateUse) const {
OpNoForForwarding = ~0U;
MachineInstr *DefMI = nullptr;
MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
const TargetRegisterInfo *TRI = &getRegisterInfo();
// If we're in SSA, get the defs through the MRI. Otherwise, only look
// within the basic block to see if the register is defined using an LI/LI8.
if (MRI->isSSA()) {
for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
if (!MI.getOperand(i).isReg())
continue;
Register Reg = MI.getOperand(i).getReg();
if (!Register::isVirtualRegister(Reg))
continue;
unsigned TrueReg = TRI->lookThruCopyLike(Reg, MRI);
if (Register::isVirtualRegister(TrueReg)) {
DefMI = MRI->getVRegDef(TrueReg);
if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) {
OpNoForForwarding = i;
break;
}
}
}
} else {
// Looking back through the definition for each operand could be expensive,
// so exit early if this isn't an instruction that either has an immediate
// form or is already an immediate form that we can handle.
ImmInstrInfo III;
unsigned Opc = MI.getOpcode();
bool ConvertibleImmForm =
Opc == PPC::CMPWI || Opc == PPC::CMPLWI || Opc == PPC::CMPDI ||
Opc == PPC::CMPLDI || Opc == PPC::ADDI || Opc == PPC::ADDI8 ||
Opc == PPC::ORI || Opc == PPC::ORI8 || Opc == PPC::XORI ||
Opc == PPC::XORI8 || Opc == PPC::RLDICL || Opc == PPC::RLDICL_rec ||
Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 ||
Opc == PPC::RLWINM || Opc == PPC::RLWINM_rec || Opc == PPC::RLWINM8 ||
Opc == PPC::RLWINM8_rec;
bool IsVFReg = (MI.getNumOperands() && MI.getOperand(0).isReg())
? isVFRegister(MI.getOperand(0).getReg())
: false;
if (!ConvertibleImmForm && !instrHasImmForm(Opc, IsVFReg, III, true))
return nullptr;
// Don't convert or %X, %Y, %Y since that's just a register move.
if ((Opc == PPC::OR || Opc == PPC::OR8) &&
MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
return nullptr;
for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
MachineOperand &MO = MI.getOperand(i);
SeenIntermediateUse = false;
if (MO.isReg() && MO.isUse() && !MO.isImplicit()) {
Register Reg = MI.getOperand(i).getReg();
// If we see another use of this reg between the def and the MI,
// we want to flag it so the def isn't deleted.
MachineInstr *DefMI = getDefMIPostRA(Reg, MI, SeenIntermediateUse);
if (DefMI) {
// Is this register defined by some form of add-immediate (including
// load-immediate) within this basic block?
switch (DefMI->getOpcode()) {
default:
break;
case PPC::LI:
case PPC::LI8:
case PPC::ADDItocL:
case PPC::ADDI:
case PPC::ADDI8:
OpNoForForwarding = i;
return DefMI;
}
}
}
}
}
return OpNoForForwarding == ~0U ? nullptr : DefMI;
}
const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const {
static const unsigned OpcodesForSpill[2][SOK_LastOpcodeSpill] = {
// Power 8
{PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR,
PPC::SPILL_CRBIT, PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX,
PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb,
PPC::SPILLTOVSR_ST, PPC::EVSTDD},
// Power 9
{PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR,
PPC::SPILL_CRBIT, PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32,
PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb,
PPC::SPILLTOVSR_ST}};
return OpcodesForSpill[(Subtarget.hasP9Vector()) ? 1 : 0];
}
const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const {
static const unsigned OpcodesForSpill[2][SOK_LastOpcodeSpill] = {
// Power 8
{PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR,
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX,
PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb,
PPC::SPILLTOVSR_LD, PPC::EVLDD},
// Power 9
{PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR,
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, PPC::DFLOADf32,
PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb,
PPC::SPILLTOVSR_LD}};
return OpcodesForSpill[(Subtarget.hasP9Vector()) ? 1 : 0];
}
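// Fix up the kill and dead flags on RegNo between StartMI and EndMI after a
// transformation: the last use of RegNo is marked killed, or its def in
// StartMI is marked dead if no use remains, and stale kill flags in between
// are cleared.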
void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI,
unsigned RegNo) const {
const MachineRegisterInfo &MRI =
StartMI.getParent()->getParent()->getRegInfo();
if (MRI.isSSA())
return;
// Instructions between [StartMI, EndMI] should be in the same basic block.
assert((StartMI.getParent() == EndMI.getParent()) &&
"Instructions are not in same basic block");
bool IsKillSet = false;
auto clearOperandKillInfo = [=] (MachineInstr &MI, unsigned Index) {
MachineOperand &MO = MI.getOperand(Index);
if (MO.isReg() && MO.isUse() && MO.isKill() &&
getRegisterInfo().regsOverlap(MO.getReg(), RegNo))
MO.setIsKill(false);
};
// Set killed flag for EndMI.
// No need to do anything if EndMI defines RegNo.
int UseIndex =
EndMI.findRegisterUseOperandIdx(RegNo, false, &getRegisterInfo());
if (UseIndex != -1) {
EndMI.getOperand(UseIndex).setIsKill(true);
IsKillSet = true;
// Clear the killed flag for other EndMI operands related to RegNo. In some
// unexpected cases, killed may be set multiple times for the same register
// operand in the same MI.
for (int i = 0, e = EndMI.getNumOperands(); i != e; ++i)
if (i != UseIndex)
clearOperandKillInfo(EndMI, i);
}
// Walk the instructions in reverse order (EndMI -> StartMI].
MachineBasicBlock::reverse_iterator It = EndMI;
MachineBasicBlock::reverse_iterator E = EndMI.getParent()->rend();
// EndMI has been handled above, skip it here.
It++;
MachineOperand *MO = nullptr;
for (; It != E; ++It) {
// Skip instructions which cannot be a def/use of RegNo.
if (It->isDebugInstr() || It->isPosition())
continue;
// Clear the killed flag for all operands of It related to RegNo. In some
// unexpected cases, killed may be set multiple times for the same register
// operand in the same MI.
for (int i = 0, e = It->getNumOperands(); i != e; ++i)
clearOperandKillInfo(*It, i);
// If killed is not set yet, set it on the last use of RegNo, or set dead on
// its def if no use is found.
if (!IsKillSet) {
if ((MO = It->findRegisterUseOperand(RegNo, false, &getRegisterInfo()))) {
// Use found, set it killed.
IsKillSet = true;
MO->setIsKill(true);
continue;
} else if ((MO = It->findRegisterDefOperand(RegNo, false, true,
&getRegisterInfo()))) {
// No use found, set dead for its def.
assert(&*It == &StartMI && "No new def between StartMI and EndMI.");
MO->setIsDead(true);
break;
}
}
if ((&*It) == &StartMI)
break;
}
// Ensure that RegNo is killed or dead by EndMI.
assert((IsKillSet || (MO && MO->isDead())) &&
"RegNo should be killed or dead");
}
// This opt tries to convert the following imm form to an index form to save an
// add for stack variables.
// Return false if no such pattern found.
//
// ADDI instr: ToBeChangedReg = ADDI FrameBaseReg, OffsetAddi
// ADD instr: ToBeDeletedReg = ADD ToBeChangedReg(killed), ScaleReg
// Imm instr: Reg = op OffsetImm, ToBeDeletedReg(killed)
//
// can be converted to:
//
// new ADDI instr: ToBeChangedReg = ADDI FrameBaseReg, (OffsetAddi + OffsetImm)
// Index instr: Reg = opx ScaleReg, ToBeChangedReg(killed)
//
// In order to eliminate the ADD instr, make sure that:
// 1: (OffsetAddi + OffsetImm) must be int16 since this offset will be used in
// the new ADDI instr and ADDI can only take an int16 Imm.
// 2: ToBeChangedReg must be killed in the ADD instr and there must be no
// other use between the ADDI and ADD instr, since its original def in the
// ADDI will be changed in the new ADDI instr. There should also be no new
// def for it between the ADD and Imm instr, as ToBeChangedReg will be used
// in the Index instr.
// 3: ToBeDeletedReg must be killed in the Imm instr and there must be no
// other use between the ADD and Imm instr, since the ADD instr will be
// eliminated.
// 4: ScaleReg must not be redefined between the ADD and Imm instr since it
// will be moved to the Index instr.
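// For example (illustrative register choices, assuming the imm-form LD maps
// to the index-form LDX):
//
// x3 = ADDI x1, 144 ; frame base + 144
// x4 = ADD8 killed x3, x5
// x6 = LD 32, killed x4
//
// becomes:
//
// x3 = ADDI x1, 176 ; 144 + 32
// x6 = LDX x5, killed x3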
bool PPCInstrInfo::foldFrameOffset(MachineInstr &MI) const {
MachineFunction *MF = MI.getParent()->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
bool PostRA = !MRI->isSSA();
// Do this opt after PEI, which is after RA. The reason is that stack slot
// expansion in PEI may expose such opportunities, since it is in PEI that
// stack slot offsets to the frame base (OffsetAddi) are determined.
if (!PostRA)
return false;
unsigned ToBeDeletedReg = 0;
int64_t OffsetImm = 0;
unsigned XFormOpcode = 0;
ImmInstrInfo III;
// Check if the Imm instr meets the requirements.
if (!isImmInstrEligibleForFolding(MI, ToBeDeletedReg, XFormOpcode, OffsetImm,
III))
return false;
bool OtherIntermediateUse = false;
MachineInstr *ADDMI = getDefMIPostRA(ToBeDeletedReg, MI, OtherIntermediateUse);
// Exit if there is another use between the ADD and Imm instr, or no def found.
if (OtherIntermediateUse || !ADDMI)
return false;
// Check if the ADD instr meets the requirements.
if (!isADDInstrEligibleForFolding(*ADDMI))
return false;
unsigned ScaleRegIdx = 0;
int64_t OffsetAddi = 0;
MachineInstr *ADDIMI = nullptr;
// Check if there is a valid ToBeChangedReg in ADDMI.
// 1: It must be killed.
// 2: Its definition must be a valid ADDIMI.
// 3: It must satisfy the int16 offset requirement.
if (isValidToBeChangedReg(ADDMI, 1, ADDIMI, OffsetAddi, OffsetImm))
ScaleRegIdx = 2;
else if (isValidToBeChangedReg(ADDMI, 2, ADDIMI, OffsetAddi, OffsetImm))
ScaleRegIdx = 1;
else
return false;
assert(ADDIMI && "There should be ADDIMI for valid ToBeChangedReg.");
unsigned ToBeChangedReg = ADDIMI->getOperand(0).getReg();
unsigned ScaleReg = ADDMI->getOperand(ScaleRegIdx).getReg();
auto NewDefFor = [&](unsigned Reg, MachineBasicBlock::iterator Start,
MachineBasicBlock::iterator End) {
for (auto It = ++Start; It != End; It++)
if (It->modifiesRegister(Reg, &getRegisterInfo()))
return true;
return false;
};
// Make sure there is no other def for ToBeChangedReg or ScaleReg between the
// ADD instr and the Imm instr.
if (NewDefFor(ToBeChangedReg, *ADDMI, MI) || NewDefFor(ScaleReg, *ADDMI, MI))
return false;
// Now start to do the transformation.
LLVM_DEBUG(dbgs() << "Replace instruction: "
<< "\n");
LLVM_DEBUG(ADDIMI->dump());
LLVM_DEBUG(ADDMI->dump());
LLVM_DEBUG(MI.dump());
LLVM_DEBUG(dbgs() << "with: "
<< "\n");
// Update ADDI instr.
ADDIMI->getOperand(2).setImm(OffsetAddi + OffsetImm);
// Update Imm instr.
MI.setDesc(get(XFormOpcode));
MI.getOperand(III.ImmOpNo)
.ChangeToRegister(ScaleReg, false, false,
ADDMI->getOperand(ScaleRegIdx).isKill());
MI.getOperand(III.OpNoForForwarding)
.ChangeToRegister(ToBeChangedReg, false, false, true);
// Eliminate ADD instr.
ADDMI->eraseFromParent();
LLVM_DEBUG(ADDIMI->dump());
LLVM_DEBUG(MI.dump());
return true;
}
bool PPCInstrInfo::isADDIInstrEligibleForFolding(MachineInstr &ADDIMI,
int64_t &Imm) const {
unsigned Opc = ADDIMI.getOpcode();
// Exit if the instruction is not ADDI.
if (Opc != PPC::ADDI && Opc != PPC::ADDI8)
return false;
+ // The operand may not necessarily be an immediate - it could be a relocation.
+ if (!ADDIMI.getOperand(2).isImm())
+ return false;
+
Imm = ADDIMI.getOperand(2).getImm();
return true;
}
bool PPCInstrInfo::isADDInstrEligibleForFolding(MachineInstr &ADDMI) const {
unsigned Opc = ADDMI.getOpcode();
// Exit if the instruction is not ADD.
return Opc == PPC::ADD4 || Opc == PPC::ADD8;
}
bool PPCInstrInfo::isImmInstrEligibleForFolding(MachineInstr &MI,
unsigned &ToBeDeletedReg,
unsigned &XFormOpcode,
int64_t &OffsetImm,
ImmInstrInfo &III) const {
// Only handle load/store.
if (!MI.mayLoadOrStore())
return false;
unsigned Opc = MI.getOpcode();
XFormOpcode = RI.getMappedIdxOpcForImmOpc(Opc);
// Exit if instruction has no index form.
if (XFormOpcode == PPC::INSTRUCTION_LIST_END)
return false;
// TODO: sync the logic between instrHasImmForm() and ImmToIdxMap.
if (!instrHasImmForm(XFormOpcode, isVFRegister(MI.getOperand(0).getReg()),
III, true))
return false;
if (!III.IsSummingOperands)
return false;
MachineOperand ImmOperand = MI.getOperand(III.ImmOpNo);
MachineOperand RegOperand = MI.getOperand(III.OpNoForForwarding);
// Only support imm operands, not relocation slots or others.
if (!ImmOperand.isImm())
return false;
assert(RegOperand.isReg() && "Instruction format is not right");
// If there are other uses of ToBeDeletedReg after the Imm instr, we cannot delete it.
if (!RegOperand.isKill())
return false;
ToBeDeletedReg = RegOperand.getReg();
OffsetImm = ImmOperand.getImm();
return true;
}
bool PPCInstrInfo::isValidToBeChangedReg(MachineInstr *ADDMI, unsigned Index,
MachineInstr *&ADDIMI,
int64_t &OffsetAddi,
int64_t OffsetImm) const {
assert((Index == 1 || Index == 2) && "Invalid operand index for add.");
MachineOperand &MO = ADDMI->getOperand(Index);
if (!MO.isKill())
return false;
bool OtherIntermediateUse = false;
ADDIMI = getDefMIPostRA(MO.getReg(), *ADDMI, OtherIntermediateUse);
// Currently we handle only the single "add + Imminstr" pair case; exit if
// any other intermediate use of ToBeChangedReg is found.
// TODO: handle the cases where there are other "add + Imminstr" pairs
// with same offset in Imminstr which is like:
//
// ADDI instr: ToBeChangedReg = ADDI FrameBaseReg, OffsetAddi
// ADD instr1: ToBeDeletedReg1 = ADD ToBeChangedReg, ScaleReg1
// Imm instr1: Reg1 = op1 OffsetImm, ToBeDeletedReg1(killed)
// ADD instr2: ToBeDeletedReg2 = ADD ToBeChangedReg(killed), ScaleReg2
// Imm instr2: Reg2 = op2 OffsetImm, ToBeDeletedReg2(killed)
//
// can be converted to:
//
// new ADDI instr: ToBeChangedReg = ADDI FrameBaseReg,
// (OffsetAddi + OffsetImm)
// Index instr1: Reg1 = opx1 ScaleReg1, ToBeChangedReg
// Index instr2: Reg2 = opx2 ScaleReg2, ToBeChangedReg(killed)
if (OtherIntermediateUse || !ADDIMI)
return false;
// Check if ADDI instr meets requirement.
if (!isADDIInstrEligibleForFolding(*ADDIMI, OffsetAddi))
return false;
if (isInt<16>(OffsetAddi + OffsetImm))
return true;
return false;
}
// If this instruction has an immediate form and one of its operands is a
// result of a load-immediate or an add-immediate, convert it to
// the immediate form if the constant is in range.
bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
MachineInstr **KilledDef) const {
MachineFunction *MF = MI.getParent()->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
bool PostRA = !MRI->isSSA();
bool SeenIntermediateUse = true;
unsigned ForwardingOperand = ~0U;
MachineInstr *DefMI = getForwardingDefMI(MI, ForwardingOperand,
SeenIntermediateUse);
if (!DefMI)
return false;
assert(ForwardingOperand < MI.getNumOperands() &&
"The forwarding operand needs to be valid at this point");
bool IsForwardingOperandKilled = MI.getOperand(ForwardingOperand).isKill();
bool KillFwdDefMI = !SeenIntermediateUse && IsForwardingOperandKilled;
Register ForwardingOperandReg = MI.getOperand(ForwardingOperand).getReg();
if (KilledDef && KillFwdDefMI)
*KilledDef = DefMI;
ImmInstrInfo III;
bool IsVFReg = MI.getOperand(0).isReg()
? isVFRegister(MI.getOperand(0).getReg())
: false;
bool HasImmForm = instrHasImmForm(MI.getOpcode(), IsVFReg, III, PostRA);
// If this is a reg+reg instruction that has a reg+imm form,
// and one of the operands is produced by an add-immediate,
// try to convert it.
if (HasImmForm &&
transformToImmFormFedByAdd(MI, III, ForwardingOperand, *DefMI,
KillFwdDefMI))
return true;
if ((DefMI->getOpcode() != PPC::LI && DefMI->getOpcode() != PPC::LI8) ||
!DefMI->getOperand(1).isImm())
return false;
int64_t Immediate = DefMI->getOperand(1).getImm();
// Sign-extend to 64-bits.
int64_t SExtImm = ((uint64_t)Immediate & ~0x7FFFuLL) != 0 ?
(Immediate | 0xFFFFFFFFFFFF0000) : Immediate;
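// Illustrative values (not from the source): LI only materializes a 16-bit
// field, so for Immediate = 0x8000 the test above fires and
// SExtImm = 0x8000 | 0xFFFFFFFFFFFF0000 = -32768, while Immediate = 0x1234
// has no bit set at or above bit 15 and is kept as-is.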
// If this is a reg+reg instruction that has a reg+imm form,
// and one of the operands is produced by LI, convert it now.
if (HasImmForm)
return transformToImmFormFedByLI(MI, III, ForwardingOperand, *DefMI, SExtImm);
bool ReplaceWithLI = false;
bool Is64BitLI = false;
int64_t NewImm = 0;
bool SetCR = false;
unsigned Opc = MI.getOpcode();
switch (Opc) {
default: return false;
// FIXME: Any branches conditional on such a comparison can be made
// unconditional. At this time, this happens too infrequently to be worth
// the implementation effort, but if that ever changes, we could convert
// such a pattern here.
case PPC::CMPWI:
case PPC::CMPLWI:
case PPC::CMPDI:
case PPC::CMPLDI: {
// Doing this post-RA would require dataflow analysis to reliably find uses
// of the CR register set by the compare.
// No need to fixup killed/dead flag since this transformation is only valid
// before RA.
if (PostRA)
return false;
// If a compare-immediate is fed by an immediate and is itself an input of
// an ISEL (the most common case), convert the ISEL into a COPY of the
// correct register.
bool Changed = false;
Register DefReg = MI.getOperand(0).getReg();
int64_t Comparand = MI.getOperand(2).getImm();
int64_t SExtComparand = ((uint64_t)Comparand & ~0x7FFFuLL) != 0 ?
(Comparand | 0xFFFFFFFFFFFF0000) : Comparand;
for (auto &CompareUseMI : MRI->use_instructions(DefReg)) {
unsigned UseOpc = CompareUseMI.getOpcode();
if (UseOpc != PPC::ISEL && UseOpc != PPC::ISEL8)
continue;
unsigned CRSubReg = CompareUseMI.getOperand(3).getSubReg();
Register TrueReg = CompareUseMI.getOperand(1).getReg();
Register FalseReg = CompareUseMI.getOperand(2).getReg();
unsigned RegToCopy = selectReg(SExtImm, SExtComparand, Opc, TrueReg,
FalseReg, CRSubReg);
if (RegToCopy == PPC::NoRegister)
continue;
// Can't use PPC::COPY to copy PPC::ZERO[8]. Convert it to LI[8] 0.
if (RegToCopy == PPC::ZERO || RegToCopy == PPC::ZERO8) {
CompareUseMI.setDesc(get(UseOpc == PPC::ISEL8 ? PPC::LI8 : PPC::LI));
replaceInstrOperandWithImm(CompareUseMI, 1, 0);
CompareUseMI.RemoveOperand(3);
CompareUseMI.RemoveOperand(2);
continue;
}
LLVM_DEBUG(
dbgs() << "Found LI -> CMPI -> ISEL, replacing with a copy.\n");
LLVM_DEBUG(DefMI->dump(); MI.dump(); CompareUseMI.dump());
LLVM_DEBUG(dbgs() << "Is converted to:\n");
// Convert to copy and remove unneeded operands.
CompareUseMI.setDesc(get(PPC::COPY));
CompareUseMI.RemoveOperand(3);
CompareUseMI.RemoveOperand(RegToCopy == TrueReg ? 2 : 1);
CmpIselsConverted++;
Changed = true;
LLVM_DEBUG(CompareUseMI.dump());
}
if (Changed)
return true;
// This may end up incremented multiple times since this function is called
// during a fixed-point transformation, but it is only meant to indicate the
// presence of this opportunity.
MissedConvertibleImmediateInstrs++;
return false;
}
// Immediate forms - may simply be convertible to an LI.
case PPC::ADDI:
case PPC::ADDI8: {
// Does the sum fit in a 16-bit signed field?
int64_t Addend = MI.getOperand(2).getImm();
if (isInt<16>(Addend + SExtImm)) {
ReplaceWithLI = true;
Is64BitLI = Opc == PPC::ADDI8;
NewImm = Addend + SExtImm;
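// A sketch with hypothetical registers: with DefMI "x = LI 100" and
// MI "y = ADDI x, 200", we get NewImm = 300 and MI is later rewritten
// below to "y = LI 300".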
break;
}
return false;
}
case PPC::RLDICL:
case PPC::RLDICL_rec:
case PPC::RLDICL_32:
case PPC::RLDICL_32_64: {
// Use APInt's rotate function.
int64_t SH = MI.getOperand(2).getImm();
int64_t MB = MI.getOperand(3).getImm();
APInt InVal((Opc == PPC::RLDICL || Opc == PPC::RLDICL_rec) ? 64 : 32,
SExtImm, true);
InVal = InVal.rotl(SH);
uint64_t Mask = (1LLU << (63 - MB + 1)) - 1;
InVal &= Mask;
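// A worked example (illustrative values): for MB = 48,
// Mask = (1 << 16) - 1 = 0xFFFF, i.e. the rotated value is masked down to
// its low 64 - MB = 16 bits.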
// Can't replace negative values with an LI as that will sign-extend
// and not clear the left bits. If we're setting the CR bit, we will use
// ANDI_rec which won't sign extend, so that's safe.
if (isUInt<15>(InVal.getSExtValue()) ||
(Opc == PPC::RLDICL_rec && isUInt<16>(InVal.getSExtValue()))) {
ReplaceWithLI = true;
Is64BitLI = Opc != PPC::RLDICL_32;
NewImm = InVal.getSExtValue();
SetCR = Opc == PPC::RLDICL_rec;
break;
}
return false;
}
case PPC::RLWINM:
case PPC::RLWINM8:
case PPC::RLWINM_rec:
case PPC::RLWINM8_rec: {
int64_t SH = MI.getOperand(2).getImm();
int64_t MB = MI.getOperand(3).getImm();
int64_t ME = MI.getOperand(4).getImm();
APInt InVal(32, SExtImm, true);
InVal = InVal.rotl(SH);
// Set the bits (MB + 32) to (ME + 32).
uint64_t Mask = ((1LLU << (32 - MB)) - 1) & ~((1LLU << (31 - ME)) - 1);
InVal &= Mask;
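// A worked example (illustrative values): for MB = 16 and ME = 31,
// Mask = ((1 << 16) - 1) & ~((1 << 0) - 1) = 0xFFFF, keeping the low
// halfword of the rotated value.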
// Can't replace negative values with an LI as that will sign-extend
// and not clear the left bits. If we're setting the CR bit, we will use
// ANDI_rec which won't sign extend, so that's safe.
bool ValueFits = isUInt<15>(InVal.getSExtValue());
ValueFits |= ((Opc == PPC::RLWINM_rec || Opc == PPC::RLWINM8_rec) &&
isUInt<16>(InVal.getSExtValue()));
if (ValueFits) {
ReplaceWithLI = true;
Is64BitLI = Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8_rec;
NewImm = InVal.getSExtValue();
SetCR = Opc == PPC::RLWINM_rec || Opc == PPC::RLWINM8_rec;
break;
}
return false;
}
case PPC::ORI:
case PPC::ORI8:
case PPC::XORI:
case PPC::XORI8: {
int64_t LogicalImm = MI.getOperand(2).getImm();
int64_t Result = 0;
if (Opc == PPC::ORI || Opc == PPC::ORI8)
Result = LogicalImm | SExtImm;
else
Result = LogicalImm ^ SExtImm;
if (isInt<16>(Result)) {
ReplaceWithLI = true;
Is64BitLI = Opc == PPC::ORI8 || Opc == PPC::XORI8;
NewImm = Result;
break;
}
return false;
}
}
if (ReplaceWithLI) {
// We need to be careful with CR-setting instructions we're replacing.
if (SetCR) {
// We don't know anything about uses when we're out of SSA, so only
// replace if the new immediate will be reproduced.
bool ImmChanged = (SExtImm & NewImm) != NewImm;
if (PostRA && ImmChanged)
return false;
if (!PostRA) {
// If the defining load-immediate has no other uses, we can just replace
// the immediate with the new immediate.
if (MRI->hasOneUse(DefMI->getOperand(0).getReg()))
DefMI->getOperand(1).setImm(NewImm);
// If we're not using the GPR result of the CR-setting instruction, we
// just need to and with zero/non-zero depending on the new immediate.
else if (MRI->use_empty(MI.getOperand(0).getReg())) {
if (NewImm) {
assert(Immediate && "Transformation converted zero to non-zero?");
NewImm = Immediate;
}
}
else if (ImmChanged)
return false;
}
}
LLVM_DEBUG(dbgs() << "Replacing instruction:\n");
LLVM_DEBUG(MI.dump());
LLVM_DEBUG(dbgs() << "Fed by:\n");
LLVM_DEBUG(DefMI->dump());
LoadImmediateInfo LII;
LII.Imm = NewImm;
LII.Is64Bit = Is64BitLI;
LII.SetCR = SetCR;
// If we're setting the CR, the original load-immediate must be kept (as an
// operand to ANDI_rec/ANDI8_rec).
if (KilledDef && SetCR)
*KilledDef = nullptr;
replaceInstrWithLI(MI, LII);
// Fixup killed/dead flag after transformation.
// Pattern:
// ForwardingOperandReg = LI imm1
// y = op2 imm2, ForwardingOperandReg(killed)
if (IsForwardingOperandKilled)
fixupIsDeadOrKill(*DefMI, MI, ForwardingOperandReg);
LLVM_DEBUG(dbgs() << "With:\n");
LLVM_DEBUG(MI.dump());
return true;
}
return false;
}
bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg,
ImmInstrInfo &III, bool PostRA) const {
// The vast majority of the instructions would need their operand 2 replaced
// with an immediate when switching to the reg+imm form. A marked exception is
// the update-form loads/stores, for which a constant operand 2 would need to
// turn into a displacement and move operand 1 to the operand 2 position.
III.ImmOpNo = 2;
III.OpNoForForwarding = 2;
III.ImmWidth = 16;
III.ImmMustBeMultipleOf = 1;
III.TruncateImmTo = 0;
III.IsSummingOperands = false;
switch (Opc) {
default: return false;
case PPC::ADD4:
case PPC::ADD8:
III.SignedImm = true;
III.ZeroIsSpecialOrig = 0;
III.ZeroIsSpecialNew = 1;
III.IsCommutative = true;
III.IsSummingOperands = true;
III.ImmOpcode = Opc == PPC::ADD4 ? PPC::ADDI : PPC::ADDI8;
break;
case PPC::ADDC:
case PPC::ADDC8:
III.SignedImm = true;
III.ZeroIsSpecialOrig = 0;
III.ZeroIsSpecialNew = 0;
III.IsCommutative = true;
III.IsSummingOperands = true;
III.ImmOpcode = Opc == PPC::ADDC ? PPC::ADDIC : PPC::ADDIC8;
break;
case PPC::ADDC_rec:
III.SignedImm = true;
III.ZeroIsSpecialOrig = 0;
III.ZeroIsSpecialNew = 0;
III.IsCommutative = true;
III.IsSummingOperands = true;
III.ImmOpcode = PPC::ADDIC_rec;
break;
case PPC::SUBFC:
case PPC::SUBFC8:
III.SignedImm = true;
III.ZeroIsSpecialOrig = 0;
III.ZeroIsSpecialNew = 0;
III.IsCommutative = false;
III.ImmOpcode = Opc == PPC::SUBFC ? PPC::SUBFIC : PPC::SUBFIC8;
break;
case PPC::CMPW:
case PPC::CMPD:
III.SignedImm = true;
III.ZeroIsSpecialOrig = 0;
III.ZeroIsSpecialNew = 0;
III.IsCommutative = false;
III.ImmOpcode = Opc == PPC::CMPW ? PPC::CMPWI : PPC::CMPDI;
break;
case PPC::CMPLW:
case PPC::CMPLD:
III.SignedImm = false;
III.ZeroIsSpecialOrig = 0;
III.ZeroIsSpecialNew = 0;
III.IsCommutative = false;
III.ImmOpcode = Opc == PPC::CMPLW ? PPC::CMPLWI : PPC::CMPLDI;
break;
case PPC::AND_rec:
case PPC::AND8_rec:
case PPC::OR:
case PPC::OR8:
case PPC::XOR:
case PPC::XOR8:
III.SignedImm = false;
III.ZeroIsSpecialOrig = 0;
III.ZeroIsSpecialNew = 0;
III.IsCommutative = true;
switch(Opc) {
default: llvm_unreachable("Unknown opcode");
case PPC::AND_rec:
III.ImmOpcode = PPC::ANDI_rec;
break;
case PPC::AND8_rec:
III.ImmOpcode = PPC::ANDI8_rec;
break;
case PPC::OR: III.ImmOpcode = PPC::ORI; break;
case PPC::OR8: III.ImmOpcode = PPC::ORI8; break;
case PPC::XOR: III.ImmOpcode = PPC::XORI; break;
case PPC::XOR8: III.ImmOpcode = PPC::XORI8; break;
}
break;
case PPC::RLWNM:
case PPC::RLWNM8:
case PPC::RLWNM_rec:
case PPC::RLWNM8_rec:
case PPC::SLW:
case PPC::SLW8:
case PPC::SLW_rec:
case PPC::SLW8_rec:
case PPC::SRW:
case PPC::SRW8:
case PPC::SRW_rec:
case PPC::SRW8_rec:
case PPC::SRAW:
case PPC::SRAW_rec:
III.SignedImm = false;
III.ZeroIsSpecialOrig = 0;
III.ZeroIsSpecialNew = 0;
III.IsCommutative = false;
// This isn't actually true, but the instructions ignore any of the
// upper bits, so any immediate loaded with an LI is acceptable.
// This does not apply to shift right algebraic because a value
// out of range will produce a -1/0.
III.ImmWidth = 16;
if (Opc == PPC::RLWNM || Opc == PPC::RLWNM8 || Opc == PPC::RLWNM_rec ||
Opc == PPC::RLWNM8_rec)
III.TruncateImmTo = 5;
else
III.TruncateImmTo = 6;
switch(Opc) {
default: llvm_unreachable("Unknown opcode");
case PPC::RLWNM: III.ImmOpcode = PPC::RLWINM; break;
case PPC::RLWNM8: III.ImmOpcode = PPC::RLWINM8; break;
case PPC::RLWNM_rec:
III.ImmOpcode = PPC::RLWINM_rec;
break;
case PPC::RLWNM8_rec:
III.ImmOpcode = PPC::RLWINM8_rec;
break;
case PPC::SLW: III.ImmOpcode = PPC::RLWINM; break;
case PPC::SLW8: III.ImmOpcode = PPC::RLWINM8; break;
case PPC::SLW_rec:
III.ImmOpcode = PPC::RLWINM_rec;
break;
case PPC::SLW8_rec:
III.ImmOpcode = PPC::RLWINM8_rec;
break;
case PPC::SRW: III.ImmOpcode = PPC::RLWINM; break;
case PPC::SRW8: III.ImmOpcode = PPC::RLWINM8; break;
case PPC::SRW_rec:
III.ImmOpcode = PPC::RLWINM_rec;
break;
case PPC::SRW8_rec:
III.ImmOpcode = PPC::RLWINM8_rec;
break;
case PPC::SRAW:
III.ImmWidth = 5;
III.TruncateImmTo = 0;
III.ImmOpcode = PPC::SRAWI;
break;
case PPC::SRAW_rec:
III.ImmWidth = 5;
III.TruncateImmTo = 0;
III.ImmOpcode = PPC::SRAWI_rec;
break;
}
break;
case PPC::RLDCL:
case PPC::RLDCL_rec:
case PPC::RLDCR:
case PPC::RLDCR_rec:
case PPC::SLD:
case PPC::SLD_rec:
case PPC::SRD:
case PPC::SRD_rec:
case PPC::SRAD:
case PPC::SRAD_rec:
III.SignedImm = false;
III.ZeroIsSpecialOrig = 0;
III.ZeroIsSpecialNew = 0;
III.IsCommutative = false;
// This isn't actually true, but the instructions ignore any of the
// upper bits, so any immediate loaded with an LI is acceptable.
// This does not apply to shift right algebraic because a value
// out of range will produce a -1/0.
III.ImmWidth = 16;
if (Opc == PPC::RLDCL || Opc == PPC::RLDCL_rec || Opc == PPC::RLDCR ||
Opc == PPC::RLDCR_rec)
III.TruncateImmTo = 6;
else
III.TruncateImmTo = 7;
switch(Opc) {
default: llvm_unreachable("Unknown opcode");
case PPC::RLDCL: III.ImmOpcode = PPC::RLDICL; break;
case PPC::RLDCL_rec:
III.ImmOpcode = PPC::RLDICL_rec;
break;
case PPC::RLDCR: III.ImmOpcode = PPC::RLDICR; break;
case PPC::RLDCR_rec:
III.ImmOpcode = PPC::RLDICR_rec;
break;
case PPC::SLD: III.ImmOpcode = PPC::RLDICR; break;
case PPC::SLD_rec:
III.ImmOpcode = PPC::RLDICR_rec;
break;
case PPC::SRD: III.ImmOpcode = PPC::RLDICL; break;
case PPC::SRD_rec:
III.ImmOpcode = PPC::RLDICL_rec;
break;
case PPC::SRAD:
III.ImmWidth = 6;
III.TruncateImmTo = 0;
III.ImmOpcode = PPC::SRADI;
break;
case PPC::SRAD_rec:
III.ImmWidth = 6;
III.TruncateImmTo = 0;
III.ImmOpcode = PPC::SRADI_rec;
break;
}
break;
// Loads and stores:
case PPC::LBZX:
case PPC::LBZX8:
case PPC::LHZX:
case PPC::LHZX8:
case PPC::LHAX:
case PPC::LHAX8:
case PPC::LWZX:
case PPC::LWZX8:
case PPC::LWAX:
case PPC::LDX:
case PPC::LFSX:
case PPC::LFDX:
case PPC::STBX:
case PPC::STBX8:
case PPC::STHX:
case PPC::STHX8:
case PPC::STWX:
case PPC::STWX8:
case PPC::STDX:
case PPC::STFSX:
case PPC::STFDX:
III.SignedImm = true;
III.ZeroIsSpecialOrig = 1;
III.ZeroIsSpecialNew = 2;
III.IsCommutative = true;
III.IsSummingOperands = true;
III.ImmOpNo = 1;
III.OpNoForForwarding = 2;
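// An illustrative reading of the fields above: for an X-form load such as
// "LBZX $dst, $rA, $rB", naming r0 in the RA position (operand 1) reads as
// a literal 0 (ZeroIsSpecialOrig = 1), while in the D-form
// "LBZ $dst, disp($rA)" the base register sits at operand 2
// (ZeroIsSpecialNew = 2).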
switch(Opc) {
default: llvm_unreachable("Unknown opcode");
case PPC::LBZX: III.ImmOpcode = PPC::LBZ; break;
case PPC::LBZX8: III.ImmOpcode = PPC::LBZ8; break;
case PPC::LHZX: III.ImmOpcode = PPC::LHZ; break;
case PPC::LHZX8: III.ImmOpcode = PPC::LHZ8; break;
case PPC::LHAX: III.ImmOpcode = PPC::LHA; break;
case PPC::LHAX8: III.ImmOpcode = PPC::LHA8; break;
case PPC::LWZX: III.ImmOpcode = PPC::LWZ; break;
case PPC::LWZX8: III.ImmOpcode = PPC::LWZ8; break;
case PPC::LWAX:
III.ImmOpcode = PPC::LWA;
III.ImmMustBeMultipleOf = 4;
break;
case PPC::LDX: III.ImmOpcode = PPC::LD; III.ImmMustBeMultipleOf = 4; break;
case PPC::LFSX: III.ImmOpcode = PPC::LFS; break;
case PPC::LFDX: III.ImmOpcode = PPC::LFD; break;
case PPC::STBX: III.ImmOpcode = PPC::STB; break;
case PPC::STBX8: III.ImmOpcode = PPC::STB8; break;
case PPC::STHX: III.ImmOpcode = PPC::STH; break;
case PPC::STHX8: III.ImmOpcode = PPC::STH8; break;
case PPC::STWX: III.ImmOpcode = PPC::STW; break;
case PPC::STWX8: III.ImmOpcode = PPC::STW8; break;
case PPC::STDX:
III.ImmOpcode = PPC::STD;
III.ImmMustBeMultipleOf = 4;
break;
case PPC::STFSX: III.ImmOpcode = PPC::STFS; break;
case PPC::STFDX: III.ImmOpcode = PPC::STFD; break;
}
break;
case PPC::LBZUX:
case PPC::LBZUX8:
case PPC::LHZUX:
case PPC::LHZUX8:
case PPC::LHAUX:
case PPC::LHAUX8:
case PPC::LWZUX:
case PPC::LWZUX8:
case PPC::LDUX:
case PPC::LFSUX:
case PPC::LFDUX:
case PPC::STBUX:
case PPC::STBUX8:
case PPC::STHUX:
case PPC::STHUX8:
case PPC::STWUX:
case PPC::STWUX8:
case PPC::STDUX:
case PPC::STFSUX:
case PPC::STFDUX:
III.SignedImm = true;
III.ZeroIsSpecialOrig = 2;
III.ZeroIsSpecialNew = 3;
III.IsCommutative = false;
III.IsSummingOperands = true;
III.ImmOpNo = 2;
III.OpNoForForwarding = 3;
switch(Opc) {
default: llvm_unreachable("Unknown opcode");
case PPC::LBZUX: III.ImmOpcode = PPC::LBZU; break;
case PPC::LBZUX8: III.ImmOpcode = PPC::LBZU8; break;
case PPC::LHZUX: III.ImmOpcode = PPC::LHZU; break;
case PPC::LHZUX8: III.ImmOpcode = PPC::LHZU8; break;
case PPC::LHAUX: III.ImmOpcode = PPC::LHAU; break;
case PPC::LHAUX8: III.ImmOpcode = PPC::LHAU8; break;
case PPC::LWZUX: III.ImmOpcode = PPC::LWZU; break;
case PPC::LWZUX8: III.ImmOpcode = PPC::LWZU8; break;
case PPC::LDUX:
III.ImmOpcode = PPC::LDU;
III.ImmMustBeMultipleOf = 4;
break;
case PPC::LFSUX: III.ImmOpcode = PPC::LFSU; break;
case PPC::LFDUX: III.ImmOpcode = PPC::LFDU; break;
case PPC::STBUX: III.ImmOpcode = PPC::STBU; break;
case PPC::STBUX8: III.ImmOpcode = PPC::STBU8; break;
case PPC::STHUX: III.ImmOpcode = PPC::STHU; break;
case PPC::STHUX8: III.ImmOpcode = PPC::STHU8; break;
case PPC::STWUX: III.ImmOpcode = PPC::STWU; break;
case PPC::STWUX8: III.ImmOpcode = PPC::STWU8; break;
case PPC::STDUX:
III.ImmOpcode = PPC::STDU;
III.ImmMustBeMultipleOf = 4;
break;
case PPC::STFSUX: III.ImmOpcode = PPC::STFSU; break;
case PPC::STFDUX: III.ImmOpcode = PPC::STFDU; break;
}
break;
// Power9 and up only. For some of these, the X-Form version has access to all
// 64 VSRs whereas the D-Form only has access to the VRs. We replace those
// with pseudo-ops pre-RA and, for post-RA, we check that the register loaded
// into or stored from is one of the VR registers.
case PPC::LXVX:
case PPC::LXSSPX:
case PPC::LXSDX:
case PPC::STXVX:
case PPC::STXSSPX:
case PPC::STXSDX:
case PPC::XFLOADf32:
case PPC::XFLOADf64:
case PPC::XFSTOREf32:
case PPC::XFSTOREf64:
if (!Subtarget.hasP9Vector())
return false;
III.SignedImm = true;
III.ZeroIsSpecialOrig = 1;
III.ZeroIsSpecialNew = 2;
III.IsCommutative = true;
III.IsSummingOperands = true;
III.ImmOpNo = 1;
III.OpNoForForwarding = 2;
III.ImmMustBeMultipleOf = 4;
switch(Opc) {
default: llvm_unreachable("Unknown opcode");
case PPC::LXVX:
III.ImmOpcode = PPC::LXV;
III.ImmMustBeMultipleOf = 16;
break;
case PPC::LXSSPX:
if (PostRA) {
if (IsVFReg)
III.ImmOpcode = PPC::LXSSP;
else {
III.ImmOpcode = PPC::LFS;
III.ImmMustBeMultipleOf = 1;
}
break;
}
LLVM_FALLTHROUGH;
case PPC::XFLOADf32:
III.ImmOpcode = PPC::DFLOADf32;
break;
case PPC::LXSDX:
if (PostRA) {
if (IsVFReg)
III.ImmOpcode = PPC::LXSD;
else {
III.ImmOpcode = PPC::LFD;
III.ImmMustBeMultipleOf = 1;
}
break;
}
LLVM_FALLTHROUGH;
case PPC::XFLOADf64:
III.ImmOpcode = PPC::DFLOADf64;
break;
case PPC::STXVX:
III.ImmOpcode = PPC::STXV;
III.ImmMustBeMultipleOf = 16;
break;
case PPC::STXSSPX:
if (PostRA) {
if (IsVFReg)
III.ImmOpcode = PPC::STXSSP;
else {
III.ImmOpcode = PPC::STFS;
III.ImmMustBeMultipleOf = 1;
}
break;
}
LLVM_FALLTHROUGH;
case PPC::XFSTOREf32:
III.ImmOpcode = PPC::DFSTOREf32;
break;
case PPC::STXSDX:
if (PostRA) {
if (IsVFReg)
III.ImmOpcode = PPC::STXSD;
else {
III.ImmOpcode = PPC::STFD;
III.ImmMustBeMultipleOf = 1;
}
break;
}
LLVM_FALLTHROUGH;
case PPC::XFSTOREf64:
III.ImmOpcode = PPC::DFSTOREf64;
break;
}
break;
}
return true;
}
// Utility function for swapping two arbitrary operands of an instruction.
static void swapMIOperands(MachineInstr &MI, unsigned Op1, unsigned Op2) {
assert(Op1 != Op2 && "Cannot swap operand with itself.");
unsigned MaxOp = std::max(Op1, Op2);
unsigned MinOp = std::min(Op1, Op2);
MachineOperand MOp1 = MI.getOperand(MinOp);
MachineOperand MOp2 = MI.getOperand(MaxOp);
MI.RemoveOperand(std::max(Op1, Op2));
MI.RemoveOperand(std::min(Op1, Op2));
// If the operands we are swapping are the two at the end (the common case)
// we can just remove both and add them in the opposite order.
if (MaxOp - MinOp == 1 && MI.getNumOperands() == MinOp) {
MI.addOperand(MOp2);
MI.addOperand(MOp1);
} else {
// Store all operands in a temporary vector, remove them and re-add in the
// right order.
SmallVector<MachineOperand, 2> MOps;
unsigned TotalOps = MI.getNumOperands() + 2; // We've already removed 2 ops.
for (unsigned i = MI.getNumOperands() - 1; i >= MinOp; i--) {
MOps.push_back(MI.getOperand(i));
MI.RemoveOperand(i);
}
// MOp2 needs to be added next.
MI.addOperand(MOp2);
// Now add the rest.
for (unsigned i = MI.getNumOperands(); i < TotalOps; i++) {
if (i == MaxOp)
MI.addOperand(MOp1);
else {
MI.addOperand(MOps.back());
MOps.pop_back();
}
}
}
}
// Check if 'MI', whose operand at index OpNoForForwarding is being forwarded
// into, meets the requirements described in the ImmInstrInfo.
bool PPCInstrInfo::isUseMIElgibleForForwarding(MachineInstr &MI,
const ImmInstrInfo &III,
unsigned OpNoForForwarding) const {
// As the algorithm for checking for PPC::ZERO/PPC::ZERO8
// does not work pre-RA, we can only do the check post-RA.
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
if (MRI.isSSA())
return false;
// Cannot do the transform if MI isn't summing the operands.
if (!III.IsSummingOperands)
return false;
// The instruction we are trying to replace must have the ZeroIsSpecialOrig set.
if (!III.ZeroIsSpecialOrig)
return false;
// We cannot do the transform if the operand we are trying to replace
// isn't the same as the operand the instruction allows.
if (OpNoForForwarding != III.OpNoForForwarding)
return false;
// Check if the instruction we are trying to transform really has
// the special zero register as its operand.
if (MI.getOperand(III.ZeroIsSpecialOrig).getReg() != PPC::ZERO &&
MI.getOperand(III.ZeroIsSpecialOrig).getReg() != PPC::ZERO8)
return false;
// This machine instruction is convertible if:
// 1. it is summing the operands,
// 2. one of the operands is the special zero register, and
// 3. the operand we are trying to replace is allowed by the MI.
return true;
}
// Check if the DefMI is an add inst and set the ImmMO and RegMO
// accordingly.
bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI,
const ImmInstrInfo &III,
MachineOperand *&ImmMO,
MachineOperand *&RegMO) const {
unsigned Opc = DefMI.getOpcode();
if (Opc != PPC::ADDItocL && Opc != PPC::ADDI && Opc != PPC::ADDI8)
return false;
assert(DefMI.getNumOperands() >= 3 &&
"Add inst must have at least three operands");
RegMO = &DefMI.getOperand(1);
ImmMO = &DefMI.getOperand(2);
// This DefMI is eligible for forwarding if it is:
// 1. an add inst, and
// 2. one of its operands is an Imm/CPI/Global.
return isAnImmediateOperand(*ImmMO);
}
bool PPCInstrInfo::isRegElgibleForForwarding(
const MachineOperand &RegMO, const MachineInstr &DefMI,
const MachineInstr &MI, bool KillDefMI,
bool &IsFwdFeederRegKilled) const {
// x = addi y, imm
// ...
// z = lfdx 0, x -> z = lfd imm(y)
// The Reg "y" can be forwarded to the MI(z) only when there is no DEF
// of "y" between the DEF of "x" and "z".
// The query is only valid post-RA.
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
if (MRI.isSSA())
return false;
Register Reg = RegMO.getReg();
// Walk the instructions in reverse (MI --> DefMI) to find the last DEF of Reg.
MachineBasicBlock::const_reverse_iterator It = MI;
MachineBasicBlock::const_reverse_iterator E = MI.getParent()->rend();
It++;
for (; It != E; ++It) {
if (It->modifiesRegister(Reg, &getRegisterInfo()) && (&*It) != &DefMI)
return false;
else if (It->killsRegister(Reg, &getRegisterInfo()) && (&*It) != &DefMI)
IsFwdFeederRegKilled = true;
// Made it to DefMI without encountering a clobber.
if ((&*It) == &DefMI)
break;
}
assert((&*It) == &DefMI && "DefMI is missing");
// If DefMI also defines the register to be forwarded, we can only forward it
// if DefMI is being erased.
if (DefMI.modifiesRegister(Reg, &getRegisterInfo()))
return KillDefMI;
return true;
}
bool PPCInstrInfo::isImmElgibleForForwarding(const MachineOperand &ImmMO,
const MachineInstr &DefMI,
const ImmInstrInfo &III,
int64_t &Imm) const {
assert(isAnImmediateOperand(ImmMO) && "ImmMO is NOT an immediate");
if (DefMI.getOpcode() == PPC::ADDItocL) {
// The operand of ADDItocL is a CPI, which isn't an immediate at compile time;
// however, we know that it is 16 bits wide and has an alignment of 4.
// Check whether the instruction meets those requirements.
if (III.ImmMustBeMultipleOf > 4 ||
III.TruncateImmTo || III.ImmWidth != 16)
return false;
// Going from X-Form to D-Form loads means that the displacement needs to be
// not just an immediate but also a multiple of 4 or 16, depending on the
// load. A displacement that is only a multiple of, say, 2 cannot be
// represented by a D-Form load; X-Form loads do not have this restriction.
if (ImmMO.isGlobal() &&
ImmMO.getGlobal()->getAlignment() < III.ImmMustBeMultipleOf)
return false;
return true;
}
if (ImmMO.isImm()) {
// It is an Imm; we need to check whether the Imm fits the range.
int64_t Immediate = ImmMO.getImm();
// Sign-extend to 64-bits.
Imm = ((uint64_t)Immediate & ~0x7FFFuLL) != 0 ?
(Immediate | 0xFFFFFFFFFFFF0000) : Immediate;
if (Imm % III.ImmMustBeMultipleOf)
return false;
if (III.TruncateImmTo)
Imm &= ((1 << III.TruncateImmTo) - 1);
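// e.g. (illustrative values) with TruncateImmTo = 5, as for the 32-bit
// rotates, Imm = 40 (0b101000) is masked down to 8, matching the hardware's
// use of only the low 5 bits of the rotate amount.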
if (III.SignedImm) {
APInt ActualValue(64, Imm, true);
if (!ActualValue.isSignedIntN(III.ImmWidth))
return false;
} else {
uint64_t UnsignedMax = (1 << III.ImmWidth) - 1;
if ((uint64_t)Imm > UnsignedMax)
return false;
}
}
else
return false;
// This ImmMO can be forwarded if it meets the requirements described
// in ImmInstrInfo.
return true;
}
// If an X-Form instruction is fed by an add-immediate and one of its operands
// is the literal zero, attempt to forward the source of the add-immediate to
// the corresponding D-Form instruction with the displacement coming from
// the immediate being added.
bool PPCInstrInfo::transformToImmFormFedByAdd(
MachineInstr &MI, const ImmInstrInfo &III, unsigned OpNoForForwarding,
MachineInstr &DefMI, bool KillDefMI) const {
// RegMO ImmMO
// | |
// x = addi reg, imm <----- DefMI
// y = op 0 , x <----- MI
// |
// OpNoForForwarding
// Check if the MI meets the requirements described in the III.
if (!isUseMIElgibleForForwarding(MI, III, OpNoForForwarding))
return false;
// Check if the DefMI meets the requirements
// described in the III. If yes, set the ImmMO and RegMO accordingly.
MachineOperand *ImmMO = nullptr;
MachineOperand *RegMO = nullptr;
if (!isDefMIElgibleForForwarding(DefMI, III, ImmMO, RegMO))
return false;
assert(ImmMO && RegMO && "Imm and Reg operand must have been set");
// Now that we have the Imm operand, check whether the ImmMO meets
// the requirements described in the III. If so, set the Imm.
int64_t Imm = 0;
if (!isImmElgibleForForwarding(*ImmMO, DefMI, III, Imm))
return false;
bool IsFwdFeederRegKilled = false;
// Check if the RegMO can be forwarded to MI.
if (!isRegElgibleForForwarding(*RegMO, DefMI, MI, KillDefMI,
IsFwdFeederRegKilled))
return false;
// Get killed info in case fixup needed after transformation.
unsigned ForwardKilledOperandReg = ~0U;
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
bool PostRA = !MRI.isSSA();
if (PostRA && MI.getOperand(OpNoForForwarding).isKill())
ForwardKilledOperandReg = MI.getOperand(OpNoForForwarding).getReg();
// We know that the MI and DefMI both match the pattern and that
// the Imm also meets the requirements of the new Imm-form.
// It is safe to do the transformation now.
LLVM_DEBUG(dbgs() << "Replacing instruction:\n");
LLVM_DEBUG(MI.dump());
LLVM_DEBUG(dbgs() << "Fed by:\n");
LLVM_DEBUG(DefMI.dump());
// Update the base reg first.
MI.getOperand(III.OpNoForForwarding).ChangeToRegister(RegMO->getReg(),
false, false,
RegMO->isKill());
// Then, update the imm.
if (ImmMO->isImm()) {
// If the ImmMO is Imm, change the operand that has ZERO to that Imm
// directly.
replaceInstrOperandWithImm(MI, III.ZeroIsSpecialOrig, Imm);
}
else {
// Otherwise, it is a Constant Pool Index (CPI) or a Global,
// which is in fact a relocation. We need to replace the special zero
// register with ImmMO.
// Before that, we need to fix up the target flags for the imm.
// For some reason, the flag is not set for the ImmMO when it is a CPI.
if (DefMI.getOpcode() == PPC::ADDItocL)
ImmMO->setTargetFlags(PPCII::MO_TOC_LO);
// MI does not have an interface such as MI.setOperand(i), though
// it has MI.getOperand(i). To replace the ZERO MachineOperand with
// ImmMO, we remove the ZERO operand and all the operands behind it,
// add the ImmMO, and then move back all the operands that followed ZERO.
SmallVector<MachineOperand, 2> MOps;
for (unsigned i = MI.getNumOperands() - 1; i >= III.ZeroIsSpecialOrig; i--) {
MOps.push_back(MI.getOperand(i));
MI.RemoveOperand(i);
}
// Remove the last MO in the list, which is in fact the ZERO operand.
MOps.pop_back();
// Add the imm operand.
MI.addOperand(*ImmMO);
// Now add the rest back.
for (auto &MO : MOps)
MI.addOperand(MO);
}
// Update the opcode.
MI.setDesc(get(III.ImmOpcode));
// Fix up killed/dead flag after transformation.
// Pattern 1:
// x = ADD KilledFwdFeederReg, imm
// n = opn KilledFwdFeederReg(killed), regn
// y = XOP 0, x
// Pattern 2:
// x = ADD reg(killed), imm
// y = XOP 0, x
if (IsFwdFeederRegKilled || RegMO->isKill())
fixupIsDeadOrKill(DefMI, MI, RegMO->getReg());
// Pattern 3:
// ForwardKilledOperandReg = ADD reg, imm
// y = XOP 0, ForwardKilledOperandReg(killed)
if (ForwardKilledOperandReg != ~0U)
fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg);
LLVM_DEBUG(dbgs() << "With:\n");
LLVM_DEBUG(MI.dump());
return true;
}
bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI,
const ImmInstrInfo &III,
unsigned ConstantOpNo,
MachineInstr &DefMI,
int64_t Imm) const {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
bool PostRA = !MRI.isSSA();
// Exit early if we can't convert this.
if ((ConstantOpNo != III.OpNoForForwarding) && !III.IsCommutative)
return false;
if (Imm % III.ImmMustBeMultipleOf)
return false;
if (III.TruncateImmTo)
Imm &= ((1 << III.TruncateImmTo) - 1);
if (III.SignedImm) {
APInt ActualValue(64, Imm, true);
if (!ActualValue.isSignedIntN(III.ImmWidth))
return false;
} else {
uint64_t UnsignedMax = (1 << III.ImmWidth) - 1;
if ((uint64_t)Imm > UnsignedMax)
return false;
}
// If we're post-RA and the instructions don't agree on whether register zero
// is special, we can transform this as long as the register operand that will
// end up in the location where zero is special isn't R0.
if (PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) {
unsigned PosForOrigZero = III.ZeroIsSpecialOrig ? III.ZeroIsSpecialOrig :
III.ZeroIsSpecialNew + 1;
Register OrigZeroReg = MI.getOperand(PosForOrigZero).getReg();
Register NewZeroReg = MI.getOperand(III.ZeroIsSpecialNew).getReg();
// If R0 is in the operand where zero is special for the new instruction,
// it is unsafe to transform if the constant operand isn't that operand.
if ((NewZeroReg == PPC::R0 || NewZeroReg == PPC::X0) &&
ConstantOpNo != III.ZeroIsSpecialNew)
return false;
if ((OrigZeroReg == PPC::R0 || OrigZeroReg == PPC::X0) &&
ConstantOpNo != PosForOrigZero)
return false;
}
// Get killed info in case fixup needed after transformation.
unsigned ForwardKilledOperandReg = ~0U;
if (PostRA && MI.getOperand(ConstantOpNo).isKill())
ForwardKilledOperandReg = MI.getOperand(ConstantOpNo).getReg();
unsigned Opc = MI.getOpcode();
bool SpecialShift32 = Opc == PPC::SLW || Opc == PPC::SLW_rec ||
Opc == PPC::SRW || Opc == PPC::SRW_rec ||
Opc == PPC::SLW8 || Opc == PPC::SLW8_rec ||
Opc == PPC::SRW8 || Opc == PPC::SRW8_rec;
bool SpecialShift64 = Opc == PPC::SLD || Opc == PPC::SLD_rec ||
Opc == PPC::SRD || Opc == PPC::SRD_rec;
bool SetCR = Opc == PPC::SLW_rec || Opc == PPC::SRW_rec ||
Opc == PPC::SLD_rec || Opc == PPC::SRD_rec;
bool RightShift = Opc == PPC::SRW || Opc == PPC::SRW_rec || Opc == PPC::SRD ||
Opc == PPC::SRD_rec;
MI.setDesc(get(III.ImmOpcode));
if (ConstantOpNo == III.OpNoForForwarding) {
// Converting shifts to immediate form is a bit tricky since they may do
// one of three things:
// 1. If the shift amount is between OpSize and 2*OpSize, the result is zero
// 2. If the shift amount is zero, the result is unchanged (save for maybe
// setting CR0)
// 3. If the shift amount is in [1, OpSize), it's just a shift
if (SpecialShift32 || SpecialShift64) {
LoadImmediateInfo LII;
LII.Imm = 0;
LII.SetCR = SetCR;
LII.Is64Bit = SpecialShift64;
uint64_t ShAmt = Imm & (SpecialShift32 ? 0x1F : 0x3F);
if (Imm & (SpecialShift32 ? 0x20 : 0x40))
replaceInstrWithLI(MI, LII);
// Shifts by zero don't change the value. If we don't need to set CR0,
// just convert this to a COPY. Can't do this post-RA since we've already
// cleaned up the copies.
else if (!SetCR && ShAmt == 0 && !PostRA) {
MI.RemoveOperand(2);
MI.setDesc(get(PPC::COPY));
} else {
// The 32 bit and 64 bit instructions are quite different.
if (SpecialShift32) {
// Left shifts use (N, 0, 31-N).
// Right shifts use (32-N, N, 31) if 0 < N < 32,
// and (0, 0, 31) if N == 0.
uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt;
uint64_t MB = RightShift ? ShAmt : 0;
uint64_t ME = RightShift ? 31 : 31 - ShAmt;
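// e.g. (illustrative values) if the LI feeding an SRW supplies ShAmt = 3,
// the SRW becomes an RLWINM with (SH, MB, ME) = (32 - 3, 3, 31) = (29, 3, 31).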
replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH);
MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB)
.addImm(ME);
} else {
// Left shifts use (N, 63-N).
// Right shifts use (64-N, N) if 0 < N < 64,
// and (0, 0) if N == 0.
uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 64 - ShAmt : ShAmt;
uint64_t ME = RightShift ? ShAmt : 63 - ShAmt;
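// e.g. (illustrative values) if the LI feeding an SLD supplies ShAmt = 4,
// the SLD becomes an RLDICR with (SH, ME) = (4, 63 - 4) = (4, 59).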
replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH);
MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME);
}
}
} else
replaceInstrOperandWithImm(MI, ConstantOpNo, Imm);
}
// Convert commutative instructions (switch the operands and convert the
// desired one to an immediate).
else if (III.IsCommutative) {
replaceInstrOperandWithImm(MI, ConstantOpNo, Imm);
swapMIOperands(MI, ConstantOpNo, III.OpNoForForwarding);
} else
llvm_unreachable("Should have exited early!");
// For instructions for which the constant register replaces a different
// operand than where the immediate goes, we need to swap them.
if (III.OpNoForForwarding != III.ImmOpNo)
swapMIOperands(MI, III.OpNoForForwarding, III.ImmOpNo);
// If the special R0/X0 register indices differ between the original and the
// new instruction, we need to fix up the register class in the new
// instruction.
if (!PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) {
if (III.ZeroIsSpecialNew) {
// If the operand at III.ZeroIsSpecialNew is a physical reg (e.g. ZERO/ZERO8),
// there is no need to fix up the register class.
Register RegToModify = MI.getOperand(III.ZeroIsSpecialNew).getReg();
if (Register::isVirtualRegister(RegToModify)) {
const TargetRegisterClass *NewRC =
MRI.getRegClass(RegToModify)->hasSuperClassEq(&PPC::GPRCRegClass) ?
&PPC::GPRC_and_GPRC_NOR0RegClass : &PPC::G8RC_and_G8RC_NOX0RegClass;
MRI.setRegClass(RegToModify, NewRC);
}
}
}
// Fix up killed/dead flag after transformation.
// Pattern:
// ForwardKilledOperandReg = LI imm
// y = XOP reg, ForwardKilledOperandReg(killed)
if (ForwardKilledOperandReg != ~0U)
fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg);
return true;
}
const TargetRegisterClass *
PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const {
if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
return &PPC::VSRCRegClass;
return RC;
}
int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) {
return PPC::getRecordFormOpcode(Opcode);
}
// This function returns true if the machine instruction
// always outputs a value by sign-extending a 32-bit value,
// i.e. bits 0 to 31 are the same as bit 32.
static bool isSignExtendingOp(const MachineInstr &MI) {
int Opcode = MI.getOpcode();
if (Opcode == PPC::LI || Opcode == PPC::LI8 || Opcode == PPC::LIS ||
Opcode == PPC::LIS8 || Opcode == PPC::SRAW || Opcode == PPC::SRAW_rec ||
Opcode == PPC::SRAWI || Opcode == PPC::SRAWI_rec || Opcode == PPC::LWA ||
Opcode == PPC::LWAX || Opcode == PPC::LWA_32 || Opcode == PPC::LWAX_32 ||
Opcode == PPC::LHA || Opcode == PPC::LHAX || Opcode == PPC::LHA8 ||
Opcode == PPC::LHAX8 || Opcode == PPC::LBZ || Opcode == PPC::LBZX ||
Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 || Opcode == PPC::LBZU ||
Opcode == PPC::LBZUX || Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 ||
Opcode == PPC::LHZ || Opcode == PPC::LHZX || Opcode == PPC::LHZ8 ||
Opcode == PPC::LHZX8 || Opcode == PPC::LHZU || Opcode == PPC::LHZUX ||
Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8 || Opcode == PPC::EXTSB ||
Opcode == PPC::EXTSB_rec || Opcode == PPC::EXTSH ||
Opcode == PPC::EXTSH_rec || Opcode == PPC::EXTSB8 ||
Opcode == PPC::EXTSH8 || Opcode == PPC::EXTSW ||
Opcode == PPC::EXTSW_rec || Opcode == PPC::SETB || Opcode == PPC::SETB8 ||
Opcode == PPC::EXTSH8_32_64 || Opcode == PPC::EXTSW_32_64 ||
Opcode == PPC::EXTSB8_32_64)
return true;
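// An RLDICL with MB >= 33 clears at least bits 0-32 (IBM numbering, with the
// MSB as bit 0), leaving a non-negative value that fits in the low 31 bits,
// so the result is trivially sign-extended from 32 bits.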
if (Opcode == PPC::RLDICL && MI.getOperand(3).getImm() >= 33)
return true;
if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINM_rec ||
Opcode == PPC::RLWNM || Opcode == PPC::RLWNM_rec) &&
MI.getOperand(3).getImm() > 0 &&
MI.getOperand(3).getImm() <= MI.getOperand(4).getImm())
return true;
return false;
}
// This function returns true if the machine instruction
// always outputs zeros in the higher 32 bits.
static bool isZeroExtendingOp(const MachineInstr &MI) {
int Opcode = MI.getOpcode();
// The 16-bit immediate is sign-extended in li/lis.
// If the most significant bit is zero, all higher bits are zero.
if (Opcode == PPC::LI || Opcode == PPC::LI8 ||
Opcode == PPC::LIS || Opcode == PPC::LIS8) {
int64_t Imm = MI.getOperand(1).getImm();
if (((uint64_t)Imm & ~0x7FFFuLL) == 0)
return true;
}
// We have some variations of rotate-and-mask instructions
// that clear the higher 32 bits.
if ((Opcode == PPC::RLDICL || Opcode == PPC::RLDICL_rec ||
Opcode == PPC::RLDCL || Opcode == PPC::RLDCL_rec ||
Opcode == PPC::RLDICL_32_64) &&
MI.getOperand(3).getImm() >= 32)
return true;
if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDIC_rec) &&
MI.getOperand(3).getImm() >= 32 &&
MI.getOperand(3).getImm() <= 63 - MI.getOperand(2).getImm())
return true;
if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINM_rec ||
Opcode == PPC::RLWNM || Opcode == PPC::RLWNM_rec ||
Opcode == PPC::RLWINM8 || Opcode == PPC::RLWNM8) &&
MI.getOperand(3).getImm() <= MI.getOperand(4).getImm())
return true;
// There are other instructions that clear the higher 32 bits.
if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZW_rec ||
Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZW_rec ||
Opcode == PPC::CNTLZW8 || Opcode == PPC::CNTTZW8 ||
Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZD_rec ||
Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZD_rec ||
Opcode == PPC::POPCNTD || Opcode == PPC::POPCNTW || Opcode == PPC::SLW ||
Opcode == PPC::SLW_rec || Opcode == PPC::SRW || Opcode == PPC::SRW_rec ||
Opcode == PPC::SLW8 || Opcode == PPC::SRW8 || Opcode == PPC::SLWI ||
Opcode == PPC::SLWI_rec || Opcode == PPC::SRWI ||
Opcode == PPC::SRWI_rec || Opcode == PPC::LWZ || Opcode == PPC::LWZX ||
Opcode == PPC::LWZU || Opcode == PPC::LWZUX || Opcode == PPC::LWBRX ||
Opcode == PPC::LHBRX || Opcode == PPC::LHZ || Opcode == PPC::LHZX ||
Opcode == PPC::LHZU || Opcode == PPC::LHZUX || Opcode == PPC::LBZ ||
Opcode == PPC::LBZX || Opcode == PPC::LBZU || Opcode == PPC::LBZUX ||
Opcode == PPC::LWZ8 || Opcode == PPC::LWZX8 || Opcode == PPC::LWZU8 ||
Opcode == PPC::LWZUX8 || Opcode == PPC::LWBRX8 || Opcode == PPC::LHBRX8 ||
Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 || Opcode == PPC::LHZU8 ||
Opcode == PPC::LHZUX8 || Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 ||
Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 ||
Opcode == PPC::ANDI_rec || Opcode == PPC::ANDIS_rec ||
Opcode == PPC::ROTRWI || Opcode == PPC::ROTRWI_rec ||
Opcode == PPC::EXTLWI || Opcode == PPC::EXTLWI_rec ||
Opcode == PPC::MFVSRWZ)
return true;
return false;
}
// This function returns true if the input MachineInstr is a TOC save
// instruction.
bool PPCInstrInfo::isTOCSaveMI(const MachineInstr &MI) const {
if (!MI.getOperand(1).isImm() || !MI.getOperand(2).isReg())
return false;
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
unsigned StackOffset = MI.getOperand(1).getImm();
Register StackReg = MI.getOperand(2).getReg();
if (StackReg == PPC::X1 && StackOffset == TOCSaveOffset)
return true;
return false;
}
// We limit the max depth to track incoming values of PHIs or binary ops
// (e.g. AND) to avoid excessive cost.
const unsigned MAX_DEPTH = 1;
bool
PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
const unsigned Depth) const {
const MachineFunction *MF = MI.getParent()->getParent();
const MachineRegisterInfo *MRI = &MF->getRegInfo();
// If we know this instruction returns sign- or zero-extended result,
// return true.
if (SignExt ? isSignExtendingOp(MI) : isZeroExtendingOp(MI))
return true;
switch (MI.getOpcode()) {
case PPC::COPY: {
Register SrcReg = MI.getOperand(1).getReg();
// In both ELFv1 and v2 ABI, method parameters and the return value
// are sign- or zero-extended.
if (MF->getSubtarget<PPCSubtarget>().isSVR4ABI()) {
const PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
// We check the ZExt/SExt flags for a method parameter.
if (MI.getParent()->getBasicBlock() ==
&MF->getFunction().getEntryBlock()) {
Register VReg = MI.getOperand(0).getReg();
if (MF->getRegInfo().isLiveIn(VReg))
return SignExt ? FuncInfo->isLiveInSExt(VReg) :
FuncInfo->isLiveInZExt(VReg);
}
// For a method return value, we check the ZExt/SExt flags in the attributes.
// We assume the following code sequence for a method call.
// ADJCALLSTACKDOWN 32, implicit dead %r1, implicit %r1
// BL8_NOP @func,...
// ADJCALLSTACKUP 32, 0, implicit dead %r1, implicit %r1
// %5 = COPY %x3; G8RC:%5
if (SrcReg == PPC::X3) {
const MachineBasicBlock *MBB = MI.getParent();
MachineBasicBlock::const_instr_iterator II =
MachineBasicBlock::const_instr_iterator(&MI);
if (II != MBB->instr_begin() &&
(--II)->getOpcode() == PPC::ADJCALLSTACKUP) {
const MachineInstr &CallMI = *(--II);
if (CallMI.isCall() && CallMI.getOperand(0).isGlobal()) {
const Function *CalleeFn =
dyn_cast<Function>(CallMI.getOperand(0).getGlobal());
if (!CalleeFn)
return false;
const IntegerType *IntTy =
dyn_cast<IntegerType>(CalleeFn->getReturnType());
const AttributeSet &Attrs =
CalleeFn->getAttributes().getRetAttributes();
if (IntTy && IntTy->getBitWidth() <= 32)
return Attrs.hasAttribute(SignExt ? Attribute::SExt :
Attribute::ZExt);
}
}
}
}
// If this is a copy from another register, we recursively check source.
if (!Register::isVirtualRegister(SrcReg))
return false;
const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
if (SrcMI != NULL)
return isSignOrZeroExtended(*SrcMI, SignExt, Depth);
return false;
}
case PPC::ANDI_rec:
case PPC::ANDIS_rec:
case PPC::ORI:
case PPC::ORIS:
case PPC::XORI:
case PPC::XORIS:
case PPC::ANDI8_rec:
case PPC::ANDIS8_rec:
case PPC::ORI8:
case PPC::ORIS8:
case PPC::XORI8:
case PPC::XORIS8: {
// A logical operation with a 16-bit immediate does not change the upper bits.
// So, we track the operand register as we do for a register copy.
Register SrcReg = MI.getOperand(1).getReg();
if (!Register::isVirtualRegister(SrcReg))
return false;
const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
if (SrcMI != NULL)
return isSignOrZeroExtended(*SrcMI, SignExt, Depth);
return false;
}
// If all incoming values are sign-/zero-extended,
// the output of OR, ISEL or PHI is also sign-/zero-extended.
case PPC::OR:
case PPC::OR8:
case PPC::ISEL:
case PPC::PHI: {
if (Depth >= MAX_DEPTH)
return false;
// The input registers for PHI are operand 1, 3, ...
// The input registers for others are operand 1 and 2.
unsigned E = 3, D = 1;
if (MI.getOpcode() == PPC::PHI) {
E = MI.getNumOperands();
D = 2;
}
for (unsigned I = 1; I != E; I += D) {
if (MI.getOperand(I).isReg()) {
Register SrcReg = MI.getOperand(I).getReg();
if (!Register::isVirtualRegister(SrcReg))
return false;
const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
if (SrcMI == NULL || !isSignOrZeroExtended(*SrcMI, SignExt, Depth+1))
return false;
}
else
return false;
}
return true;
}
// If at least one of the incoming values of an AND is zero-extended,
// then the output is also zero-extended. If both of the incoming values
// are sign-extended, then the output is also sign-extended.
case PPC::AND:
case PPC::AND8: {
if (Depth >= MAX_DEPTH)
return false;
assert(MI.getOperand(1).isReg() && MI.getOperand(2).isReg());
Register SrcReg1 = MI.getOperand(1).getReg();
Register SrcReg2 = MI.getOperand(2).getReg();
if (!Register::isVirtualRegister(SrcReg1) ||
!Register::isVirtualRegister(SrcReg2))
return false;
const MachineInstr *MISrc1 = MRI->getVRegDef(SrcReg1);
const MachineInstr *MISrc2 = MRI->getVRegDef(SrcReg2);
if (!MISrc1 || !MISrc2)
return false;
if (SignExt)
return isSignOrZeroExtended(*MISrc1, SignExt, Depth+1) &&
isSignOrZeroExtended(*MISrc2, SignExt, Depth+1);
else
return isSignOrZeroExtended(*MISrc1, SignExt, Depth+1) ||
isSignOrZeroExtended(*MISrc2, SignExt, Depth+1);
}
default:
break;
}
return false;
}
bool PPCInstrInfo::isBDNZ(unsigned Opcode) const {
return (Opcode == (Subtarget.isPPC64() ? PPC::BDNZ8 : PPC::BDNZ));
}
namespace {
class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
MachineInstr *Loop, *EndLoop, *LoopCount;
MachineFunction *MF;
const TargetInstrInfo *TII;
int64_t TripCount;
public:
PPCPipelinerLoopInfo(MachineInstr *Loop, MachineInstr *EndLoop,
MachineInstr *LoopCount)
: Loop(Loop), EndLoop(EndLoop), LoopCount(LoopCount),
MF(Loop->getParent()->getParent()),
TII(MF->getSubtarget().getInstrInfo()) {
// Inspect the Loop instruction up-front, as it may be deleted when we call
// createTripCountGreaterCondition.
if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI)
TripCount = LoopCount->getOperand(1).getImm();
else
TripCount = -1;
}
bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
// Only ignore the terminator.
return MI == EndLoop;
}
Optional<bool>
createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
SmallVectorImpl<MachineOperand> &Cond) override {
if (TripCount == -1) {
// Since the BDZ/BDZ8 that we will insert will also decrease the CTR by 1,
// we don't need to generate anything here.
Cond.push_back(MachineOperand::CreateImm(0));
Cond.push_back(MachineOperand::CreateReg(
MF->getSubtarget<PPCSubtarget>().isPPC64() ? PPC::CTR8 : PPC::CTR,
true));
return {};
}
return TripCount > TC;
}
void setPreheader(MachineBasicBlock *NewPreheader) override {
// Do nothing. We want the LOOP setup instruction to stay in the *old*
// preheader, so we can use BDZ in the prologs to adapt the loop trip count.
}
void adjustTripCount(int TripCountAdjust) override {
// If the loop trip count is a compile-time value, then just change the
// value.
if (LoopCount->getOpcode() == PPC::LI8 ||
LoopCount->getOpcode() == PPC::LI) {
int64_t TripCount = LoopCount->getOperand(1).getImm() + TripCountAdjust;
LoopCount->getOperand(1).setImm(TripCount);
return;
}
// Since the BDZ/BDZ8 that we will insert will also decrease the CTR by 1,
// we don't need to generate anything here.
}
void disposed() override {
Loop->eraseFromParent();
// Ensure the loop setup instruction is deleted too.
LoopCount->eraseFromParent();
}
};
} // namespace
std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
PPCInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
// We really "analyze" only hardware loops right now.
MachineBasicBlock::iterator I = LoopBB->getFirstTerminator();
MachineBasicBlock *Preheader = *LoopBB->pred_begin();
if (Preheader == LoopBB)
Preheader = *std::next(LoopBB->pred_begin());
MachineFunction *MF = Preheader->getParent();
if (I != LoopBB->end() && isBDNZ(I->getOpcode())) {
SmallPtrSet<MachineBasicBlock *, 8> Visited;
if (MachineInstr *LoopInst = findLoopInstr(*Preheader, Visited)) {
Register LoopCountReg = LoopInst->getOperand(0).getReg();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg);
return std::make_unique<PPCPipelinerLoopInfo>(LoopInst, &*I, LoopCount);
}
}
return nullptr;
}
MachineInstr *PPCInstrInfo::findLoopInstr(
MachineBasicBlock &PreHeader,
SmallPtrSet<MachineBasicBlock *, 8> &Visited) const {
unsigned LOOPi = (Subtarget.isPPC64() ? PPC::MTCTR8loop : PPC::MTCTRloop);
// The loop set-up instruction should be in the preheader.
for (auto &I : PreHeader.instrs())
if (I.getOpcode() == LOOPi)
return &I;
return nullptr;
}
// Return true if we can get the base operand and byte offset of the
// instruction, as well as the memory width. Width is the size of the memory
// that is being loaded/stored.
bool PPCInstrInfo::getMemOperandWithOffsetWidth(
const MachineInstr &LdSt, const MachineOperand *&BaseReg, int64_t &Offset,
unsigned &Width, const TargetRegisterInfo *TRI) const {
if (!LdSt.mayLoadOrStore())
return false;
// Handle only loads/stores with base register followed by immediate offset.
if (LdSt.getNumExplicitOperands() != 3)
return false;
if (!LdSt.getOperand(1).isImm() || !LdSt.getOperand(2).isReg())
return false;
if (!LdSt.hasOneMemOperand())
return false;
Width = (*LdSt.memoperands_begin())->getSize();
Offset = LdSt.getOperand(1).getImm();
BaseReg = &LdSt.getOperand(2);
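// e.g. (hypothetical operands) for "LWZ $r5, 24($r3)" this yields
// Width = 4, Offset = 24 and BaseReg = $r3.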
return true;
}
bool PPCInstrInfo::areMemAccessesTriviallyDisjoint(
const MachineInstr &MIa, const MachineInstr &MIb) const {
assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
// Retrieve the base register, offset from the base register, and width. Width
// is the size of the memory that is being loaded/stored (e.g. 1, 2, 4). If the
// base registers are identical, and the offset of the lower memory access plus
// its width does not overlap the offset of the higher memory access,
// then the memory accesses are disjoint.
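// For example (hypothetical values): two accesses of width 4 at 0(r3) and
// 4(r3) give LowOffset + LowWidth = 0 + 4 <= 4 = HighOffset, so they cannot
// alias.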
const TargetRegisterInfo *TRI = &getRegisterInfo();
const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
int64_t OffsetA = 0, OffsetB = 0;
unsigned int WidthA = 0, WidthB = 0;
if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
if (BaseOpA->isIdenticalTo(*BaseOpB)) {
int LowOffset = std::min(OffsetA, OffsetB);
int HighOffset = std::max(OffsetA, OffsetB);
int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
if (LowOffset + LowWidth <= HighOffset)
return true;
}
}
return false;
}
Index: head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td (revision 362609)
@@ -1,4420 +1,4437 @@
//===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the VSX extension to the PowerPC instruction set.
//
//===----------------------------------------------------------------------===//
// *********************************** NOTE ***********************************
// ** For POWER8 Little Endian, the VSX swap optimization relies on knowing **
// ** which VMX and VSX instructions are lane-sensitive and which are not. **
// ** A lane-sensitive instruction relies, implicitly or explicitly, on **
// ** whether lanes are numbered from left to right. An instruction like **
// ** VADDFP is not lane-sensitive, because each lane of the result vector **
// ** relies only on the corresponding lane of the source vectors. However, **
// ** an instruction like VMULESB is lane-sensitive, because "even" and **
// ** "odd" lanes are different for big-endian and little-endian numbering. **
// ** **
// ** When adding new VMX and VSX instructions, please consider whether they **
// ** are lane-sensitive. If so, they must be added to a switch statement **
// ** in PPCVSXSwapRemoval::gatherVectorInstructions(). **
// ****************************************************************************
def PPCRegVSRCAsmOperand : AsmOperandClass {
let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber";
}
def vsrc : RegisterOperand<VSRC> {
let ParserMatchClass = PPCRegVSRCAsmOperand;
}
def PPCRegVSFRCAsmOperand : AsmOperandClass {
let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber";
}
def vsfrc : RegisterOperand<VSFRC> {
let ParserMatchClass = PPCRegVSFRCAsmOperand;
}
def PPCRegVSSRCAsmOperand : AsmOperandClass {
let Name = "RegVSSRC"; let PredicateMethod = "isVSRegNumber";
}
def vssrc : RegisterOperand<VSSRC> {
let ParserMatchClass = PPCRegVSSRCAsmOperand;
}
def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass {
let Name = "RegSPILLTOVSRRC"; let PredicateMethod = "isVSRegNumber";
}
def spilltovsrrc : RegisterOperand<SPILLTOVSRRC> {
let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand;
}
def SDT_PPCldvsxlh : SDTypeProfile<1, 1, [
SDTCisVT<0, v4f32>, SDTCisPtrTy<1>
]>;
def SDT_PPCfpexth : SDTypeProfile<1, 2, [
SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>, SDTCisPtrTy<2>
]>;
def SDT_PPCldsplat : SDTypeProfile<1, 1, [
SDTCisVec<0>, SDTCisPtrTy<1>
]>;
// Little-endian-specific nodes.
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
]>;
def SDT_PPCstxvd2x : SDTypeProfile<0, 2, [
SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
]>;
def SDT_PPCxxswapd : SDTypeProfile<1, 1, [
SDTCisSameAs<0, 1>
]>;
def SDTVecConv : SDTypeProfile<1, 2, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>
]>;
def SDTVabsd : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32>
]>;
def SDT_PPCld_vec_be : SDTypeProfile<1, 1, [
SDTCisVec<0>, SDTCisPtrTy<1>
]>;
def SDT_PPCst_vec_be : SDTypeProfile<0, 2, [
SDTCisVec<0>, SDTCisPtrTy<1>
]>;
def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x,
[SDNPHasChain, SDNPMayStore]>;
def PPCld_vec_be : SDNode<"PPCISD::LOAD_VEC_BE", SDT_PPCld_vec_be,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCst_vec_be : SDNode<"PPCISD::STORE_VEC_BE", SDT_PPCst_vec_be,
[SDNPHasChain, SDNPMayStore]>;
def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>;
def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>;
def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>;
def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>;
def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>;
def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>;
def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
string asmstr, InstrItinClass itin, Intrinsic Int,
ValueType OutTy, ValueType InTy> {
let BaseName = asmbase in {
def NAME : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
[(set OutTy:$XT, (Int InTy:$XA, InTy:$XB))]>;
let Defs = [CR6] in
def _rec : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
[(set InTy:$XT,
(InTy (PPCvcmp_o InTy:$XA, InTy:$XB, xo)))]>,
isRecordForm;
}
}
// Instruction form with a single input register for instructions such as
// XXPERMDI. The reason for defining this is that specifying multiple chained
// operands (such as loads) to an instruction will perform both chained
// operations rather than coalescing them into a single register - even though
// the source memory location is the same. This simply forces the instruction
// to use the same register for both inputs.
// For example, an output DAG such as this:
// (XXPERMDI (LXSIBZX xoaddr:$src), (LXSIBZX xoaddr:$src), 0)
// would result in two load instructions emitted and used as separate inputs
// to the XXPERMDI instruction.
class XX3Form_2s<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XX3Form_2<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
let XB = XA;
}
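// XXPERMDIs and XXSLDWIs below are instances of this form; e.g. XXPERMDIs
// prints as "xxpermdi $XT, $XA, $XA, $DM", reading a single source register.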
def HasVSX : Predicate<"PPCSubTarget->hasVSX()">;
def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">;
def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">;
def HasOnlySwappingMemOps : Predicate<"!PPCSubTarget->hasP9Vector()">;
let Predicates = [HasVSX] in {
let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
let hasSideEffects = 0 in { // VSX instructions don't have side effects.
// Load indexed instructions
let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
def LXSDX : XX1Form_memOp<31, 588,
(outs vsfrc:$XT), (ins memrr:$src),
"lxsdx $XT, $src", IIC_LdStLFD,
[]>;
// Pseudo instruction XFLOADf64 will be expanded to LXSDX or LFDX later
let CodeSize = 3 in
def XFLOADf64 : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#XFLOADf64",
[(set f64:$XT, (load xoaddr:$src))]>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in
def LXVD2X : XX1Form_memOp<31, 844,
(outs vsrc:$XT), (ins memrr:$src),
"lxvd2x $XT, $src", IIC_LdStLFD,
[(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
def LXVDSX : XX1Form_memOp<31, 332,
(outs vsrc:$XT), (ins memrr:$src),
"lxvdsx $XT, $src", IIC_LdStLFD, []>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in
def LXVW4X : XX1Form_memOp<31, 780,
(outs vsrc:$XT), (ins memrr:$src),
"lxvw4x $XT, $src", IIC_LdStLFD,
[]>;
} // mayLoad
// Store indexed instructions
let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
def STXSDX : XX1Form_memOp<31, 716,
(outs), (ins vsfrc:$XT, memrr:$dst),
"stxsdx $XT, $dst", IIC_LdStSTFD,
[]>;
// Pseudo instruction XFSTOREf64 will be expanded to STXSDX or STFDX later
let CodeSize = 3 in
def XFSTOREf64 : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
"#XFSTOREf64",
[(store f64:$XT, xoaddr:$dst)]>;
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
// The behaviour of this instruction is endianness-specific, so we provide no
// pattern to match it without considering endianness.
def STXVD2X : XX1Form_memOp<31, 972,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvd2x $XT, $dst", IIC_LdStSTFD,
[]>;
def STXVW4X : XX1Form_memOp<31, 908,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvw4x $XT, $dst", IIC_LdStSTFD,
[]>;
}
} // mayStore
let Uses = [RM] in {
// Add/Mul Instructions
let isCommutable = 1 in {
def XSADDDP : XX3Form<60, 32,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xsadddp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (fadd f64:$XA, f64:$XB))]>;
def XSMULDP : XX3Form<60, 48,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xsmuldp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (fmul f64:$XA, f64:$XB))]>;
def XVADDDP : XX3Form<60, 96,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvadddp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (fadd v2f64:$XA, v2f64:$XB))]>;
def XVADDSP : XX3Form<60, 64,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvaddsp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (fadd v4f32:$XA, v4f32:$XB))]>;
def XVMULDP : XX3Form<60, 112,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvmuldp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (fmul v2f64:$XA, v2f64:$XB))]>;
def XVMULSP : XX3Form<60, 80,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvmulsp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (fmul v4f32:$XA, v4f32:$XB))]>;
}
// Subtract Instructions
def XSSUBDP : XX3Form<60, 40,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xssubdp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (fsub f64:$XA, f64:$XB))]>;
def XVSUBDP : XX3Form<60, 104,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvsubdp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (fsub v2f64:$XA, v2f64:$XB))]>;
def XVSUBSP : XX3Form<60, 72,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvsubsp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (fsub v4f32:$XA, v4f32:$XB))]>;
// FMA Instructions
let BaseName = "XSMADDADP" in {
let isCommutable = 1 in
def XSMADDADP : XX3Form<60, 33,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsmaddadp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (fma f64:$XA, f64:$XB, f64:$XTi))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSMADDMDP : XX3Form<60, 41,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XSMSUBADP" in {
let isCommutable = 1 in
def XSMSUBADP : XX3Form<60, 49,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsmsubadp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSMSUBMDP : XX3Form<60, 57,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XSNMADDADP" in {
let isCommutable = 1 in
def XSNMADDADP : XX3Form<60, 161,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsnmaddadp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, f64:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSNMADDMDP : XX3Form<60, 169,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XSNMSUBADP" in {
let isCommutable = 1 in
def XSNMSUBADP : XX3Form<60, 177,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsnmsubadp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSNMSUBMDP : XX3Form<60, 185,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XVMADDADP" in {
let isCommutable = 1 in
def XVMADDADP : XX3Form<60, 97,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmaddadp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVMADDMDP : XX3Form<60, 105,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XVMADDASP" in {
let isCommutable = 1 in
def XVMADDASP : XX3Form<60, 65,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmaddasp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVMADDMSP : XX3Form<60, 73,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XVMSUBADP" in {
let isCommutable = 1 in
def XVMSUBADP : XX3Form<60, 113,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmsubadp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVMSUBMDP : XX3Form<60, 121,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XVMSUBASP" in {
let isCommutable = 1 in
def XVMSUBASP : XX3Form<60, 81,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmsubasp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVMSUBMSP : XX3Form<60, 89,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XVNMADDADP" in {
let isCommutable = 1 in
def XVNMADDADP : XX3Form<60, 225,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmaddadp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVNMADDMDP : XX3Form<60, 233,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XVNMADDASP" in {
let isCommutable = 1 in
def XVNMADDASP : XX3Form<60, 193,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmaddasp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVNMADDMSP : XX3Form<60, 201,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XVNMSUBADP" in {
let isCommutable = 1 in
def XVNMSUBADP : XX3Form<60, 241,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmsubadp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVNMSUBMDP : XX3Form<60, 249,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XVNMSUBASP" in {
let isCommutable = 1 in
def XVNMSUBASP : XX3Form<60, 209,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmsubasp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVNMSUBMSP : XX3Form<60, 217,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
// Division Instructions
def XSDIVDP : XX3Form<60, 56,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xsdivdp $XT, $XA, $XB", IIC_FPDivD,
[(set f64:$XT, (fdiv f64:$XA, f64:$XB))]>;
def XSSQRTDP : XX2Form<60, 75,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xssqrtdp $XT, $XB", IIC_FPSqrtD,
[(set f64:$XT, (fsqrt f64:$XB))]>;
def XSREDP : XX2Form<60, 90,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsredp $XT, $XB", IIC_VecFP,
[(set f64:$XT, (PPCfre f64:$XB))]>;
def XSRSQRTEDP : XX2Form<60, 74,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrsqrtedp $XT, $XB", IIC_VecFP,
[(set f64:$XT, (PPCfrsqrte f64:$XB))]>;
def XSTDIVDP : XX3Form_1<60, 61,
(outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
"xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
def XSTSQRTDP : XX2Form_1<60, 106,
(outs crrc:$crD), (ins vsfrc:$XB),
"xstsqrtdp $crD, $XB", IIC_FPCompare, []>;
def XVDIVDP : XX3Form<60, 120,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvdivdp $XT, $XA, $XB", IIC_FPDivD,
[(set v2f64:$XT, (fdiv v2f64:$XA, v2f64:$XB))]>;
def XVDIVSP : XX3Form<60, 88,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvdivsp $XT, $XA, $XB", IIC_FPDivS,
[(set v4f32:$XT, (fdiv v4f32:$XA, v4f32:$XB))]>;
def XVSQRTDP : XX2Form<60, 203,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvsqrtdp $XT, $XB", IIC_FPSqrtD,
[(set v2f64:$XT, (fsqrt v2f64:$XB))]>;
def XVSQRTSP : XX2Form<60, 139,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvsqrtsp $XT, $XB", IIC_FPSqrtS,
[(set v4f32:$XT, (fsqrt v4f32:$XB))]>;
def XVTDIVDP : XX3Form_1<60, 125,
(outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
"xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
def XVTDIVSP : XX3Form_1<60, 93,
(outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
"xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>;
def XVTSQRTDP : XX2Form_1<60, 234,
(outs crrc:$crD), (ins vsrc:$XB),
"xvtsqrtdp $crD, $XB", IIC_FPCompare, []>;
def XVTSQRTSP : XX2Form_1<60, 170,
(outs crrc:$crD), (ins vsrc:$XB),
"xvtsqrtsp $crD, $XB", IIC_FPCompare, []>;
def XVREDP : XX2Form<60, 218,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvredp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (PPCfre v2f64:$XB))]>;
def XVRESP : XX2Form<60, 154,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvresp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (PPCfre v4f32:$XB))]>;
def XVRSQRTEDP : XX2Form<60, 202,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrsqrtedp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (PPCfrsqrte v2f64:$XB))]>;
def XVRSQRTESP : XX2Form<60, 138,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrsqrtesp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (PPCfrsqrte v4f32:$XB))]>;
// Compare Instructions
def XSCMPODP : XX3Form_1<60, 43,
(outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
"xscmpodp $crD, $XA, $XB", IIC_FPCompare, []>;
def XSCMPUDP : XX3Form_1<60, 35,
(outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
"xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>;
defm XVCMPEQDP : XX3Form_Rcr<60, 99,
"xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare,
int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>;
defm XVCMPEQSP : XX3Form_Rcr<60, 67,
"xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare,
int_ppc_vsx_xvcmpeqsp, v4i32, v4f32>;
defm XVCMPGEDP : XX3Form_Rcr<60, 115,
"xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare,
int_ppc_vsx_xvcmpgedp, v2i64, v2f64>;
defm XVCMPGESP : XX3Form_Rcr<60, 83,
"xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare,
int_ppc_vsx_xvcmpgesp, v4i32, v4f32>;
defm XVCMPGTDP : XX3Form_Rcr<60, 107,
"xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare,
int_ppc_vsx_xvcmpgtdp, v2i64, v2f64>;
defm XVCMPGTSP : XX3Form_Rcr<60, 75,
"xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare,
int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>;
// Move Instructions
def XSABSDP : XX2Form<60, 345,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsabsdp $XT, $XB", IIC_VecFP,
[(set f64:$XT, (fabs f64:$XB))]>;
def XSNABSDP : XX2Form<60, 361,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsnabsdp $XT, $XB", IIC_VecFP,
[(set f64:$XT, (fneg (fabs f64:$XB)))]>;
def XSNEGDP : XX2Form<60, 377,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsnegdp $XT, $XB", IIC_VecFP,
[(set f64:$XT, (fneg f64:$XB))]>;
def XSCPSGNDP : XX3Form<60, 176,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xscpsgndp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (fcopysign f64:$XB, f64:$XA))]>;
def XVABSDP : XX2Form<60, 473,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvabsdp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (fabs v2f64:$XB))]>;
def XVABSSP : XX2Form<60, 409,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvabssp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (fabs v4f32:$XB))]>;
def XVCPSGNDP : XX3Form<60, 240,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvcpsgndp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (fcopysign v2f64:$XB, v2f64:$XA))]>;
def XVCPSGNSP : XX3Form<60, 208,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvcpsgnsp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (fcopysign v4f32:$XB, v4f32:$XA))]>;
def XVNABSDP : XX2Form<60, 489,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvnabsdp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (fneg (fabs v2f64:$XB)))]>;
def XVNABSSP : XX2Form<60, 425,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvnabssp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (fneg (fabs v4f32:$XB)))]>;
def XVNEGDP : XX2Form<60, 505,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvnegdp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (fneg v2f64:$XB))]>;
def XVNEGSP : XX2Form<60, 441,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvnegsp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (fneg v4f32:$XB))]>;
// Conversion Instructions
def XSCVDPSP : XX2Form<60, 265,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvdpsp $XT, $XB", IIC_VecFP, []>;
def XSCVDPSXDS : XX2Form<60, 344,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvdpsxds $XT, $XB", IIC_VecFP,
[(set f64:$XT, (PPCfctidz f64:$XB))]>;
let isCodeGenOnly = 1 in
def XSCVDPSXDSs : XX2Form<60, 344,
(outs vssrc:$XT), (ins vssrc:$XB),
"xscvdpsxds $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfctidz f32:$XB))]>;
def XSCVDPSXWS : XX2Form<60, 88,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvdpsxws $XT, $XB", IIC_VecFP,
[(set f64:$XT, (PPCfctiwz f64:$XB))]>;
let isCodeGenOnly = 1 in
def XSCVDPSXWSs : XX2Form<60, 88,
(outs vssrc:$XT), (ins vssrc:$XB),
"xscvdpsxws $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfctiwz f32:$XB))]>;
def XSCVDPUXDS : XX2Form<60, 328,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvdpuxds $XT, $XB", IIC_VecFP,
[(set f64:$XT, (PPCfctiduz f64:$XB))]>;
let isCodeGenOnly = 1 in
def XSCVDPUXDSs : XX2Form<60, 328,
(outs vssrc:$XT), (ins vssrc:$XB),
"xscvdpuxds $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfctiduz f32:$XB))]>;
def XSCVDPUXWS : XX2Form<60, 72,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvdpuxws $XT, $XB", IIC_VecFP,
[(set f64:$XT, (PPCfctiwuz f64:$XB))]>;
let isCodeGenOnly = 1 in
def XSCVDPUXWSs : XX2Form<60, 72,
(outs vssrc:$XT), (ins vssrc:$XB),
"xscvdpuxws $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfctiwuz f32:$XB))]>;
def XSCVSPDP : XX2Form<60, 329,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvspdp $XT, $XB", IIC_VecFP, []>;
def XSCVSXDDP : XX2Form<60, 376,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvsxddp $XT, $XB", IIC_VecFP,
[(set f64:$XT, (PPCfcfid f64:$XB))]>;
def XSCVUXDDP : XX2Form<60, 360,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvuxddp $XT, $XB", IIC_VecFP,
[(set f64:$XT, (PPCfcfidu f64:$XB))]>;
def XVCVDPSP : XX2Form<60, 393,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvdpsp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (int_ppc_vsx_xvcvdpsp v2f64:$XB))]>;
def XVCVDPSXDS : XX2Form<60, 472,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvdpsxds $XT, $XB", IIC_VecFP,
[(set v2i64:$XT, (fp_to_sint v2f64:$XB))]>;
def XVCVDPSXWS : XX2Form<60, 216,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvdpsxws $XT, $XB", IIC_VecFP,
[(set v4i32:$XT, (int_ppc_vsx_xvcvdpsxws v2f64:$XB))]>;
def XVCVDPUXDS : XX2Form<60, 456,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvdpuxds $XT, $XB", IIC_VecFP,
[(set v2i64:$XT, (fp_to_uint v2f64:$XB))]>;
def XVCVDPUXWS : XX2Form<60, 200,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvdpuxws $XT, $XB", IIC_VecFP,
[(set v4i32:$XT, (int_ppc_vsx_xvcvdpuxws v2f64:$XB))]>;
def XVCVSPDP : XX2Form<60, 457,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvspdp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (int_ppc_vsx_xvcvspdp v4f32:$XB))]>;
def XVCVSPSXDS : XX2Form<60, 408,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvspsxds $XT, $XB", IIC_VecFP, []>;
def XVCVSPSXWS : XX2Form<60, 152,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvspsxws $XT, $XB", IIC_VecFP,
[(set v4i32:$XT, (fp_to_sint v4f32:$XB))]>;
def XVCVSPUXDS : XX2Form<60, 392,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvspuxds $XT, $XB", IIC_VecFP, []>;
def XVCVSPUXWS : XX2Form<60, 136,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvspuxws $XT, $XB", IIC_VecFP,
[(set v4i32:$XT, (fp_to_uint v4f32:$XB))]>;
def XVCVSXDDP : XX2Form<60, 504,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvsxddp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (sint_to_fp v2i64:$XB))]>;
def XVCVSXDSP : XX2Form<60, 440,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvsxdsp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (int_ppc_vsx_xvcvsxdsp v2i64:$XB))]>;
def XVCVSXWDP : XX2Form<60, 248,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvsxwdp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (int_ppc_vsx_xvcvsxwdp v4i32:$XB))]>;
def XVCVSXWSP : XX2Form<60, 184,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvsxwsp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (sint_to_fp v4i32:$XB))]>;
def XVCVUXDDP : XX2Form<60, 488,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvuxddp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (uint_to_fp v2i64:$XB))]>;
def XVCVUXDSP : XX2Form<60, 424,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvuxdsp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (int_ppc_vsx_xvcvuxdsp v2i64:$XB))]>;
def XVCVUXWDP : XX2Form<60, 232,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvuxwdp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (int_ppc_vsx_xvcvuxwdp v4i32:$XB))]>;
def XVCVUXWSP : XX2Form<60, 168,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvuxwsp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (uint_to_fp v4i32:$XB))]>;
// Rounding Instructions
def XSRDPI : XX2Form<60, 73,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpi $XT, $XB", IIC_VecFP,
[(set f64:$XT, (fround f64:$XB))]>;
def XSRDPIC : XX2Form<60, 107,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpic $XT, $XB", IIC_VecFP,
[(set f64:$XT, (fnearbyint f64:$XB))]>;
def XSRDPIM : XX2Form<60, 121,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpim $XT, $XB", IIC_VecFP,
[(set f64:$XT, (ffloor f64:$XB))]>;
def XSRDPIP : XX2Form<60, 105,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpip $XT, $XB", IIC_VecFP,
[(set f64:$XT, (fceil f64:$XB))]>;
def XSRDPIZ : XX2Form<60, 89,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpiz $XT, $XB", IIC_VecFP,
[(set f64:$XT, (ftrunc f64:$XB))]>;
def XVRDPI : XX2Form<60, 201,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpi $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (fround v2f64:$XB))]>;
def XVRDPIC : XX2Form<60, 235,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpic $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (fnearbyint v2f64:$XB))]>;
def XVRDPIM : XX2Form<60, 249,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpim $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (ffloor v2f64:$XB))]>;
def XVRDPIP : XX2Form<60, 233,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpip $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (fceil v2f64:$XB))]>;
def XVRDPIZ : XX2Form<60, 217,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpiz $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (ftrunc v2f64:$XB))]>;
def XVRSPI : XX2Form<60, 137,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspi $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (fround v4f32:$XB))]>;
def XVRSPIC : XX2Form<60, 171,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspic $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (fnearbyint v4f32:$XB))]>;
def XVRSPIM : XX2Form<60, 185,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspim $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (ffloor v4f32:$XB))]>;
def XVRSPIP : XX2Form<60, 169,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspip $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (fceil v4f32:$XB))]>;
def XVRSPIZ : XX2Form<60, 153,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspiz $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (ftrunc v4f32:$XB))]>;
// Max/Min Instructions
let isCommutable = 1 in {
def XSMAXDP : XX3Form<60, 160,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xsmaxdp $XT, $XA, $XB", IIC_VecFP,
[(set vsfrc:$XT,
(int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>;
def XSMINDP : XX3Form<60, 168,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xsmindp $XT, $XA, $XB", IIC_VecFP,
[(set vsfrc:$XT,
(int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>;
def XVMAXDP : XX3Form<60, 224,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvmaxdp $XT, $XA, $XB", IIC_VecFP,
[(set vsrc:$XT,
(int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>;
def XVMINDP : XX3Form<60, 232,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvmindp $XT, $XA, $XB", IIC_VecFP,
[(set vsrc:$XT,
(int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>;
def XVMAXSP : XX3Form<60, 192,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvmaxsp $XT, $XA, $XB", IIC_VecFP,
[(set vsrc:$XT,
(int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>;
def XVMINSP : XX3Form<60, 200,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvminsp $XT, $XA, $XB", IIC_VecFP,
[(set vsrc:$XT,
(int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>;
} // isCommutable
} // Uses = [RM]
// Logical Instructions
let isCommutable = 1 in
def XXLAND : XX3Form<60, 130,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxland $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (and v4i32:$XA, v4i32:$XB))]>;
def XXLANDC : XX3Form<60, 138,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlandc $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (and v4i32:$XA,
(vnot_ppc v4i32:$XB)))]>;
let isCommutable = 1 in {
def XXLNOR : XX3Form<60, 162,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlnor $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (vnot_ppc (or v4i32:$XA,
v4i32:$XB)))]>;
def XXLOR : XX3Form<60, 146,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlor $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (or v4i32:$XA, v4i32:$XB))]>;
let isCodeGenOnly = 1 in
def XXLORf: XX3Form<60, 146,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xxlor $XT, $XA, $XB", IIC_VecGeneral, []>;
def XXLXOR : XX3Form<60, 154,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlxor $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>;
} // isCommutable
let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
isReMaterializable = 1 in {
def XXLXORz : XX3Form_SameOp<60, 154, (outs vsrc:$XT), (ins),
"xxlxor $XT, $XT, $XT", IIC_VecGeneral,
[(set v4i32:$XT, (v4i32 immAllZerosV))]>;
def XXLXORdpz : XX3Form_SameOp<60, 154,
(outs vsfrc:$XT), (ins),
"xxlxor $XT, $XT, $XT", IIC_VecGeneral,
[(set f64:$XT, (fpimm0))]>;
def XXLXORspz : XX3Form_SameOp<60, 154,
(outs vssrc:$XT), (ins),
"xxlxor $XT, $XT, $XT", IIC_VecGeneral,
[(set f32:$XT, (fpimm0))]>;
}
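// These xxlxor-with-itself idioms materialize vector and scalar FP zeros.
// Marking them rematerializable and as cheap as a move lets the register
// allocator recreate a zero on demand instead of spilling and reloading it.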
// Permutation Instructions
def XXMRGHW : XX3Form<60, 18,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxmrghw $XT, $XA, $XB", IIC_VecPerm, []>;
def XXMRGLW : XX3Form<60, 50,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxmrglw $XT, $XA, $XB", IIC_VecPerm, []>;
def XXPERMDI : XX3Form_2<60, 10,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM),
"xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm,
[(set v2i64:$XT, (PPCxxpermdi v2i64:$XA, v2i64:$XB,
imm32SExt16:$DM))]>;
let isCodeGenOnly = 1 in
def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM),
"xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>;
def XXSEL : XX4Form<60, 3,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC),
"xxsel $XT, $XA, $XB, $XC", IIC_VecPerm, []>;
def XXSLDWI : XX3Form_2<60, 2,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$SHW),
"xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm,
[(set v4i32:$XT, (PPCvecshl v4i32:$XA, v4i32:$XB,
imm32SExt16:$SHW))]>;
let isCodeGenOnly = 1 in
def XXSLDWIs : XX3Form_2s<60, 2,
(outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$SHW),
"xxsldwi $XT, $XA, $XA, $SHW", IIC_VecPerm, []>;
def XXSPLTW : XX2Form_2<60, 164,
(outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
"xxspltw $XT, $XB, $UIM", IIC_VecPerm,
[(set v4i32:$XT,
(PPCxxsplt v4i32:$XB, imm32SExt16:$UIM))]>;
let isCodeGenOnly = 1 in
def XXSPLTWs : XX2Form_2<60, 164,
(outs vsrc:$XT), (ins vsfrc:$XB, u2imm:$UIM),
"xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
} // hasSideEffects
// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
// instruction selection into a branch sequence.
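// (A sketch of the expansion: the custom inserter emits a conditional branch
// on the condition bit and a PHI in the join block that picks $T or $F.)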
let PPC970_Single = 1 in {
def SELECT_CC_VSRC: PPCCustomInserterPseudo<(outs vsrc:$dst),
(ins crrc:$cond, vsrc:$T, vsrc:$F, i32imm:$BROPC),
"#SELECT_CC_VSRC",
[]>;
def SELECT_VSRC: PPCCustomInserterPseudo<(outs vsrc:$dst),
(ins crbitrc:$cond, vsrc:$T, vsrc:$F),
"#SELECT_VSRC",
[(set v2f64:$dst,
(select i1:$cond, v2f64:$T, v2f64:$F))]>;
def SELECT_CC_VSFRC: PPCCustomInserterPseudo<(outs f8rc:$dst),
(ins crrc:$cond, f8rc:$T, f8rc:$F,
i32imm:$BROPC), "#SELECT_CC_VSFRC",
[]>;
def SELECT_VSFRC: PPCCustomInserterPseudo<(outs f8rc:$dst),
(ins crbitrc:$cond, f8rc:$T, f8rc:$F),
"#SELECT_VSFRC",
[(set f64:$dst,
(select i1:$cond, f64:$T, f64:$F))]>;
def SELECT_CC_VSSRC: PPCCustomInserterPseudo<(outs f4rc:$dst),
(ins crrc:$cond, f4rc:$T, f4rc:$F,
i32imm:$BROPC), "#SELECT_CC_VSSRC",
[]>;
def SELECT_VSSRC: PPCCustomInserterPseudo<(outs f4rc:$dst),
(ins crbitrc:$cond, f4rc:$T, f4rc:$F),
"#SELECT_VSSRC",
[(set f32:$dst,
(select i1:$cond, f32:$T, f32:$F))]>;
}
} // AddedComplexity
def : InstAlias<"xvmovdp $XT, $XB",
(XVCPSGNDP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
def : InstAlias<"xvmovsp $XT, $XB",
(XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
def : InstAlias<"xxspltd $XT, $XB, 0",
(XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>;
def : InstAlias<"xxspltd $XT, $XB, 1",
(XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>;
def : InstAlias<"xxmrghd $XT, $XA, $XB",
(XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>;
def : InstAlias<"xxmrgld $XT, $XA, $XB",
(XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>;
def : InstAlias<"xxswapd $XT, $XB",
(XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
def : InstAlias<"xxspltd $XT, $XB, 0",
(XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>;
def : InstAlias<"xxspltd $XT, $XB, 1",
(XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>;
def : InstAlias<"xxswapd $XT, $XB",
(XXPERMDIs vsrc:$XT, vsfrc:$XB, 2)>;
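// For instance, `xxswapd vs0, vs1` assembles to `xxpermdi vs0, vs1, vs1, 2`;
// DM=2 swaps the two doublewords of the source.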
let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
def : Pat<(v4i32 (vnot_ppc v4i32:$A)),
(v4i32 (XXLNOR $A, $A))>;
def : Pat<(v4i32 (or (and (vnot_ppc v4i32:$C), v4i32:$A),
(and v4i32:$B, v4i32:$C))),
(v4i32 (XXSEL $A, $B, $C))>;
let Predicates = [IsBigEndian] in {
def : Pat<(v2f64 (scalar_to_vector f64:$A)),
(v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>;
def : Pat<(f64 (extractelt v2f64:$S, 0)),
(f64 (EXTRACT_SUBREG $S, sub_64))>;
def : Pat<(f64 (extractelt v2f64:$S, 1)),
(f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
}
let Predicates = [IsLittleEndian] in {
def : Pat<(v2f64 (scalar_to_vector f64:$A)),
(v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64),
(SUBREG_TO_REG (i64 1), $A, sub_64), 0))>;
def : Pat<(f64 (extractelt v2f64:$S, 0)),
(f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
def : Pat<(f64 (extractelt v2f64:$S, 1)),
(f64 (EXTRACT_SUBREG $S, sub_64))>;
}
// Additional fnmsub patterns: -a*b + c == -(a*b - c)
def : Pat<(fma (fneg f64:$A), f64:$B, f64:$C),
(XSNMSUBADP $C, $A, $B)>;
def : Pat<(fma f64:$A, (fneg f64:$B), f64:$C),
(XSNMSUBADP $C, $A, $B)>;
def : Pat<(fma (fneg v2f64:$A), v2f64:$B, v2f64:$C),
(XVNMSUBADP $C, $A, $B)>;
def : Pat<(fma v2f64:$A, (fneg v2f64:$B), v2f64:$C),
(XVNMSUBADP $C, $A, $B)>;
def : Pat<(fma (fneg v4f32:$A), v4f32:$B, v4f32:$C),
(XVNMSUBASP $C, $A, $B)>;
def : Pat<(fma v4f32:$A, (fneg v4f32:$B), v4f32:$C),
(XVNMSUBASP $C, $A, $B)>;
def : Pat<(v2f64 (bitconvert v4f32:$A)),
(COPY_TO_REGCLASS $A, VSRC)>;
def : Pat<(v2f64 (bitconvert v4i32:$A)),
(COPY_TO_REGCLASS $A, VSRC)>;
def : Pat<(v2f64 (bitconvert v8i16:$A)),
(COPY_TO_REGCLASS $A, VSRC)>;
def : Pat<(v2f64 (bitconvert v16i8:$A)),
(COPY_TO_REGCLASS $A, VSRC)>;
def : Pat<(v4f32 (bitconvert v2f64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v4i32 (bitconvert v2f64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v8i16 (bitconvert v2f64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v16i8 (bitconvert v2f64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v2i64 (bitconvert v4f32:$A)),
(COPY_TO_REGCLASS $A, VSRC)>;
def : Pat<(v2i64 (bitconvert v4i32:$A)),
(COPY_TO_REGCLASS $A, VSRC)>;
def : Pat<(v2i64 (bitconvert v8i16:$A)),
(COPY_TO_REGCLASS $A, VSRC)>;
def : Pat<(v2i64 (bitconvert v16i8:$A)),
(COPY_TO_REGCLASS $A, VSRC)>;
def : Pat<(v4f32 (bitconvert v2i64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v4i32 (bitconvert v2i64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v8i16 (bitconvert v2i64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v16i8 (bitconvert v2i64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v2f64 (bitconvert v2i64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v2i64 (bitconvert v2f64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v2f64 (bitconvert v1i128:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v1i128 (bitconvert v2f64:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v2i64 (bitconvert f128:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v4i32 (bitconvert f128:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v8i16 (bitconvert f128:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v16i8 (bitconvert f128:$A)),
(COPY_TO_REGCLASS $A, VRRC)>;
def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 0)),
(v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>;
def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 1)),
(v2f64 (XVCVSXWDP (v2i64 (XXMRGLW $C, $C))))>;
def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)),
(v2f64 (XVCVUXWDP (v2i64 (XXMRGHW $C, $C))))>;
def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
(v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
def : Pat<(v2f64 (PPCfpexth v4f32:$C, 0)), (XVCVSPDP (XXMRGHW $C, $C))>;
def : Pat<(v2f64 (PPCfpexth v4f32:$C, 1)), (XVCVSPDP (XXMRGLW $C, $C))>;
// Loads.
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
// Stores.
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
(STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
}
// Load and store vectors in big-endian element order
let Predicates = [IsLittleEndian, HasVSX] in {
def : Pat<(v2f64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(PPCst_vec_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(v4f32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
def : Pat<(PPCst_vec_be v4f32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
def : Pat<(v2i64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(PPCst_vec_be v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(v4i32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
def : Pat<(PPCst_vec_be v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
}
let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in {
def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVW4X xoaddr:$src)>;
def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(store v4i32:$XT, xoaddr:$dst), (STXVW4X $XT, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
(STXVW4X $rS, xoaddr:$dst)>;
}
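// On big-endian targets lxvd2x/stxvd2x access doublewords in array order, so
// plain vector loads and stores map directly to them; little-endian code goes
// through the swapping nodes above (PPClxvd2x/PPCstxvd2x plus PPCxxswapd).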
// Permutes.
def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>;
// PPCvecshl XT, XA, XA, 2 can be selected as either XXSLDWI XT,XA,XA,2 or
// XXSWAPD XT,XA (i.e. XXPERMDI XT,XA,XA,2); the latter is more profitable.
def : Pat<(v4i32 (PPCvecshl v4i32:$src, v4i32:$src, 2)), (XXPERMDI $src, $src, 2)>;
// Selects.
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
(SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULT)),
(SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLE)),
(SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>;
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULE)),
(SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>;
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETEQ)),
(SELECT_VSRC (CREQV $lhs, $rhs), $tval, $fval)>;
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGE)),
(SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>;
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGE)),
(SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>;
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGT)),
(SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGT)),
(SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETNE)),
(SELECT_VSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
(SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)),
(SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
(SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)),
(SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
(SELECT_VSFRC (CREQV $lhs, $rhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
(SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)),
(SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
(SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
(SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
(SELECT_VSFRC (CRXOR $lhs, $rhs), $tval, $fval)>;
// Divides.
def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B),
(XVDIVSP $A, $B)>;
def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B),
(XVDIVDP $A, $B)>;
// Reciprocal estimate
def : Pat<(int_ppc_vsx_xvresp v4f32:$A),
(XVRESP $A)>;
def : Pat<(int_ppc_vsx_xvredp v2f64:$A),
(XVREDP $A)>;
// Recip. square root estimate
def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A),
(XVRSQRTESP $A)>;
def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A),
(XVRSQRTEDP $A)>;
// Vector selection
def : Pat<(v16i8 (vselect v16i8:$vA, v16i8:$vB, v16i8:$vC)),
(COPY_TO_REGCLASS
(XXSEL (COPY_TO_REGCLASS $vC, VSRC),
(COPY_TO_REGCLASS $vB, VSRC),
(COPY_TO_REGCLASS $vA, VSRC)), VRRC)>;
def : Pat<(v8i16 (vselect v8i16:$vA, v8i16:$vB, v8i16:$vC)),
(COPY_TO_REGCLASS
(XXSEL (COPY_TO_REGCLASS $vC, VSRC),
(COPY_TO_REGCLASS $vB, VSRC),
(COPY_TO_REGCLASS $vA, VSRC)), VRRC)>;
def : Pat<(vselect v4i32:$vA, v4i32:$vB, v4i32:$vC),
(XXSEL $vC, $vB, $vA)>;
def : Pat<(vselect v2i64:$vA, v2i64:$vB, v2i64:$vC),
(XXSEL $vC, $vB, $vA)>;
def : Pat<(vselect v4i32:$vA, v4f32:$vB, v4f32:$vC),
(XXSEL $vC, $vB, $vA)>;
def : Pat<(vselect v2i64:$vA, v2f64:$vB, v2f64:$vC),
(XXSEL $vC, $vB, $vA)>;
def : Pat<(v4f32 (fmaxnum v4f32:$src1, v4f32:$src2)),
(v4f32 (XVMAXSP $src1, $src2))>;
def : Pat<(v4f32 (fminnum v4f32:$src1, v4f32:$src2)),
(v4f32 (XVMINSP $src1, $src2))>;
def : Pat<(v2f64 (fmaxnum v2f64:$src1, v2f64:$src2)),
(v2f64 (XVMAXDP $src1, $src2))>;
def : Pat<(v2f64 (fminnum v2f64:$src1, v2f64:$src2)),
(v2f64 (XVMINDP $src1, $src2))>;
let Predicates = [IsLittleEndian] in {
def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
(f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
(f64 (XSCVSXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
(f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
(f64 (XSCVUXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
} // IsLittleEndian
let Predicates = [IsBigEndian] in {
def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
(f64 (XSCVSXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
(f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
(f64 (XSCVUXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
(f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
} // IsBigEndian
} // AddedComplexity
} // HasVSX
def FpMinMax {
dag F32Min = (COPY_TO_REGCLASS (XSMINDP (COPY_TO_REGCLASS $A, VSFRC),
(COPY_TO_REGCLASS $B, VSFRC)),
VSSRC);
dag F32Max = (COPY_TO_REGCLASS (XSMAXDP (COPY_TO_REGCLASS $A, VSFRC),
(COPY_TO_REGCLASS $B, VSFRC)),
VSSRC);
}
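// FpMinMax.F32Min/F32Max are spliced into the f32 patterns below: $A and $B
// bind to the pattern operands, which are copied to VSFRC so the
// double-precision xsmindp/xsmaxdp can serve single precision as well (every
// f32 value is exactly representable in f64).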
let AddedComplexity = 400, Predicates = [HasVSX] in {
// f32 Min.
def : Pat<(f32 (fminnum_ieee f32:$A, f32:$B)),
(f32 FpMinMax.F32Min)>;
def : Pat<(f32 (fminnum_ieee (fcanonicalize f32:$A), f32:$B)),
(f32 FpMinMax.F32Min)>;
def : Pat<(f32 (fminnum_ieee f32:$A, (fcanonicalize f32:$B))),
(f32 FpMinMax.F32Min)>;
def : Pat<(f32 (fminnum_ieee (fcanonicalize f32:$A), (fcanonicalize f32:$B))),
(f32 FpMinMax.F32Min)>;
// f32 Max.
def : Pat<(f32 (fmaxnum_ieee f32:$A, f32:$B)),
(f32 FpMinMax.F32Max)>;
def : Pat<(f32 (fmaxnum_ieee (fcanonicalize f32:$A), f32:$B)),
(f32 FpMinMax.F32Max)>;
def : Pat<(f32 (fmaxnum_ieee f32:$A, (fcanonicalize f32:$B))),
(f32 FpMinMax.F32Max)>;
def : Pat<(f32 (fmaxnum_ieee (fcanonicalize f32:$A), (fcanonicalize f32:$B))),
(f32 FpMinMax.F32Max)>;
// f64 Min.
def : Pat<(f64 (fminnum_ieee f64:$A, f64:$B)),
(f64 (XSMINDP $A, $B))>;
def : Pat<(f64 (fminnum_ieee (fcanonicalize f64:$A), f64:$B)),
(f64 (XSMINDP $A, $B))>;
def : Pat<(f64 (fminnum_ieee f64:$A, (fcanonicalize f64:$B))),
(f64 (XSMINDP $A, $B))>;
def : Pat<(f64 (fminnum_ieee (fcanonicalize f64:$A), (fcanonicalize f64:$B))),
(f64 (XSMINDP $A, $B))>;
// f64 Max.
def : Pat<(f64 (fmaxnum_ieee f64:$A, f64:$B)),
(f64 (XSMAXDP $A, $B))>;
def : Pat<(f64 (fmaxnum_ieee (fcanonicalize f64:$A), f64:$B)),
(f64 (XSMAXDP $A, $B))>;
def : Pat<(f64 (fmaxnum_ieee f64:$A, (fcanonicalize f64:$B))),
(f64 (XSMAXDP $A, $B))>;
def : Pat<(f64 (fmaxnum_ieee (fcanonicalize f64:$A), (fcanonicalize f64:$B))),
(f64 (XSMAXDP $A, $B))>;
}
def ScalarLoads {
dag Li8 = (i32 (extloadi8 xoaddr:$src));
dag ZELi8 = (i32 (zextloadi8 xoaddr:$src));
dag ZELi8i64 = (i64 (zextloadi8 xoaddr:$src));
dag SELi8 = (i32 (sext_inreg (extloadi8 xoaddr:$src), i8));
dag SELi8i64 = (i64 (sext_inreg (extloadi8 xoaddr:$src), i8));
dag Li16 = (i32 (extloadi16 xoaddr:$src));
dag ZELi16 = (i32 (zextloadi16 xoaddr:$src));
dag ZELi16i64 = (i64 (zextloadi16 xoaddr:$src));
dag SELi16 = (i32 (sextloadi16 xoaddr:$src));
dag SELi16i64 = (i64 (sextloadi16 xoaddr:$src));
dag Li32 = (i32 (load xoaddr:$src));
}
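// Shorthand for common scalar load fragments (any-, zero-, and sign-extending
// i8/i16 loads, plus plain i32 loads); subsequent patterns in this file refer
// to them as e.g. ScalarLoads.ZELi8.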
def DWToSPExtractConv {
dag El0US1 = (f32 (PPCfcfidus
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 0))))));
dag El1US1 = (f32 (PPCfcfidus
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 1))))));
dag El0US2 = (f32 (PPCfcfidus
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 0))))));
dag El1US2 = (f32 (PPCfcfidus
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 1))))));
dag El0SS1 = (f32 (PPCfcfids
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 0))))));
dag El1SS1 = (f32 (PPCfcfids
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 1))))));
dag El0SS2 = (f32 (PPCfcfids
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 0))))));
dag El1SS2 = (f32 (PPCfcfids
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 1))))));
dag BVU = (v4f32 (build_vector El0US1, El1US1, El0US2, El1US2));
dag BVS = (v4f32 (build_vector El0SS1, El1SS1, El0SS2, El1SS2));
}
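// El{0,1}{U,S}S{1,2} extract a doubleword element and convert it to f32
// (PPCfcfids for signed, PPCfcfidus for unsigned); the endian-specific
// patterns further down match them. BVU/BVS gather all four converted
// elements of two v2i64 sources into a v4f32 build_vector.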
// The following VSX instructions were introduced in Power ISA 2.07
/* FIXME: If the operands are v2i64, these patterns will not match.
We should define new patterns or otherwise match the same patterns
when the elements are larger than i32.
*/
def HasP8Vector : Predicate<"PPCSubTarget->hasP8Vector()">;
def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">;
def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">;
let Predicates = [HasP8Vector] in {
let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
let isCommutable = 1 in {
def XXLEQV : XX3Form<60, 186,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxleqv $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (vnot_ppc (xor v4i32:$XA, v4i32:$XB)))]>;
def XXLNAND : XX3Form<60, 178,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlnand $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (vnot_ppc (and v4i32:$XA,
v4i32:$XB)))]>;
} // isCommutable
def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B),
(XXLEQV $A, $B)>;
let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
isReMaterializable = 1 in {
def XXLEQVOnes : XX3Form_SameOp<60, 186, (outs vsrc:$XT), (ins),
"xxleqv $XT, $XT, $XT", IIC_VecGeneral,
[(set v4i32:$XT, (bitconvert (v16i8 immAllOnesV)))]>;
}
def XXLORC : XX3Form<60, 170,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlorc $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>;
// VSX scalar loads introduced in ISA 2.07
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
let CodeSize = 3 in
def LXSSPX : XX1Form_memOp<31, 524, (outs vssrc:$XT), (ins memrr:$src),
"lxsspx $XT, $src", IIC_LdStLFD, []>;
def LXSIWAX : XX1Form_memOp<31, 76, (outs vsfrc:$XT), (ins memrr:$src),
"lxsiwax $XT, $src", IIC_LdStLFD, []>;
def LXSIWZX : XX1Form_memOp<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
"lxsiwzx $XT, $src", IIC_LdStLFD, []>;
// Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later
let CodeSize = 3 in
def XFLOADf32 : PseudoXFormMemOp<(outs vssrc:$XT), (ins memrr:$src),
"#XFLOADf32",
[(set f32:$XT, (load xoaddr:$src))]>;
// Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later
def LIWAX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#LIWAX",
[(set f64:$XT, (PPClfiwax xoaddr:$src))]>;
// Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later
def LIWZX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
"#LIWZX",
[(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
} // mayLoad
// VSX scalar stores introduced in ISA 2.07
let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
let CodeSize = 3 in
def STXSSPX : XX1Form_memOp<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
"stxsspx $XT, $dst", IIC_LdStSTFD, []>;
def STXSIWX : XX1Form_memOp<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
"stxsiwx $XT, $dst", IIC_LdStSTFD, []>;
// Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later
let CodeSize = 3 in
def XFSTOREf32 : PseudoXFormMemOp<(outs), (ins vssrc:$XT, memrr:$dst),
"#XFSTOREf32",
[(store f32:$XT, xoaddr:$dst)]>;
// Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later
def STIWX : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
"#STIWX",
[(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
} // mayStore
def : Pat<(f64 (extloadf32 xoaddr:$src)),
(COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>;
def : Pat<(f32 (fpround (f64 (extloadf32 xoaddr:$src)))),
(f32 (XFLOADf32 xoaddr:$src))>;
def : Pat<(f64 (fpextend f32:$src)),
(COPY_TO_REGCLASS $src, VSFRC)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
(SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
(SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
(SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)),
(SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
(SELECT_VSSRC (CREQV $lhs, $rhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
(SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)),
(SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
(SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)),
(SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
(SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
// VSX Elementary Scalar FP arithmetic (SP)
let isCommutable = 1 in {
def XSADDSP : XX3Form<60, 0,
(outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
"xsaddsp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (fadd f32:$XA, f32:$XB))]>;
def XSMULSP : XX3Form<60, 16,
(outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
"xsmulsp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (fmul f32:$XA, f32:$XB))]>;
} // isCommutable
def XSSUBSP : XX3Form<60, 8,
(outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
"xssubsp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (fsub f32:$XA, f32:$XB))]>;
def XSDIVSP : XX3Form<60, 24,
(outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
"xsdivsp $XT, $XA, $XB", IIC_FPDivS,
[(set f32:$XT, (fdiv f32:$XA, f32:$XB))]>;
def XSRESP : XX2Form<60, 26,
(outs vssrc:$XT), (ins vssrc:$XB),
"xsresp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfre f32:$XB))]>;
def XSRSP : XX2Form<60, 281,
(outs vssrc:$XT), (ins vsfrc:$XB),
"xsrsp $XT, $XB", IIC_VecFP, []>;
def XSSQRTSP : XX2Form<60, 11,
(outs vssrc:$XT), (ins vssrc:$XB),
"xssqrtsp $XT, $XB", IIC_FPSqrtS,
[(set f32:$XT, (fsqrt f32:$XB))]>;
def XSRSQRTESP : XX2Form<60, 10,
(outs vssrc:$XT), (ins vssrc:$XB),
"xsrsqrtesp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfrsqrte f32:$XB))]>;
// FMA Instructions
let BaseName = "XSMADDASP" in {
let isCommutable = 1 in
def XSMADDASP : XX3Form<60, 1,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsmaddasp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (fma f32:$XA, f32:$XB, f32:$XTi))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSMADDMSP : XX3Form<60, 9,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XSMSUBASP" in {
let isCommutable = 1 in
def XSMSUBASP : XX3Form<60, 17,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsmsubasp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (fma f32:$XA, f32:$XB,
(fneg f32:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSMSUBMSP : XX3Form<60, 25,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XSNMADDASP" in {
let isCommutable = 1 in
def XSNMADDASP : XX3Form<60, 129,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsnmaddasp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (fneg (fma f32:$XA, f32:$XB,
f32:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSNMADDMSP : XX3Form<60, 137,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
let BaseName = "XSNMSUBASP" in {
let isCommutable = 1 in
def XSNMSUBASP : XX3Form<60, 145,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsnmsubasp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (fneg (fma f32:$XA, f32:$XB,
(fneg f32:$XTi))))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSNMSUBMSP : XX3Form<60, 153,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
// Additional xsnmsubasp patterns: -a*b + c == -(a*b - c)
def : Pat<(fma (fneg f32:$A), f32:$B, f32:$C),
(XSNMSUBASP $C, $A, $B)>;
def : Pat<(fma f32:$A, (fneg f32:$B), f32:$C),
(XSNMSUBASP $C, $A, $B)>;
// Single Precision Conversions (FP <-> INT)
def XSCVSXDSP : XX2Form<60, 312,
(outs vssrc:$XT), (ins vsfrc:$XB),
"xscvsxdsp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfcfids f64:$XB))]>;
def XSCVUXDSP : XX2Form<60, 296,
(outs vssrc:$XT), (ins vsfrc:$XB),
"xscvuxdsp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfcfidus f64:$XB))]>;
// Conversions between vector and scalar single precision
def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB),
"xscvdpspn $XT, $XB", IIC_VecFP, []>;
def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB),
"xscvspdpn $XT, $XB", IIC_VecFP, []>;
let Predicates = [IsLittleEndian] in {
def : Pat<DWToSPExtractConv.El0SS1,
(f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
def : Pat<DWToSPExtractConv.El1SS1,
(f32 (XSCVSXDSP (COPY_TO_REGCLASS
(f64 (COPY_TO_REGCLASS $S1, VSRC)), VSFRC)))>;
def : Pat<DWToSPExtractConv.El0US1,
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
def : Pat<DWToSPExtractConv.El1US1,
(f32 (XSCVUXDSP (COPY_TO_REGCLASS
(f64 (COPY_TO_REGCLASS $S1, VSRC)), VSFRC)))>;
}
let Predicates = [IsBigEndian] in {
def : Pat<DWToSPExtractConv.El0SS1,
(f32 (XSCVSXDSP (COPY_TO_REGCLASS $S1, VSFRC)))>;
def : Pat<DWToSPExtractConv.El1SS1,
(f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
def : Pat<DWToSPExtractConv.El0US1,
(f32 (XSCVUXDSP (COPY_TO_REGCLASS $S1, VSFRC)))>;
def : Pat<DWToSPExtractConv.El1US1,
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
}
// Instructions for converting float to i64 feeding a store.
let Predicates = [NoP9Vector] in {
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 8),
(STXSDX (XSCVDPSXDS f64:$src), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 8),
(STXSDX (XSCVDPUXDS f64:$src), xoaddr:$dst)>;
}
// Instructions for converting float to i32 feeding a store.
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 4),
(STIWX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 4),
(STIWX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
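// v2i64 min/max have no VSX form here, so they borrow the VMX (ISA 2.07)
// vmaxsd/vmaxud/vminsd/vminud instructions; hence the copies to VRRC.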
def : Pat<(v2i64 (smax v2i64:$src1, v2i64:$src2)),
(v2i64 (VMAXSD (COPY_TO_REGCLASS $src1, VRRC),
(COPY_TO_REGCLASS $src2, VRRC)))>;
def : Pat<(v2i64 (umax v2i64:$src1, v2i64:$src2)),
(v2i64 (VMAXUD (COPY_TO_REGCLASS $src1, VRRC),
(COPY_TO_REGCLASS $src2, VRRC)))>;
def : Pat<(v2i64 (smin v2i64:$src1, v2i64:$src2)),
(v2i64 (VMINSD (COPY_TO_REGCLASS $src1, VRRC),
(COPY_TO_REGCLASS $src2, VRRC)))>;
def : Pat<(v2i64 (umin v2i64:$src1, v2i64:$src2)),
(v2i64 (VMINUD (COPY_TO_REGCLASS $src1, VRRC),
(COPY_TO_REGCLASS $src2, VRRC)))>;
} // AddedComplexity = 400
} // HasP8Vector
let AddedComplexity = 400 in {
let Predicates = [HasDirectMove] in {
// VSX direct move instructions
def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT),
"mfvsrd $rA, $XT", IIC_VecGeneral,
[(set i64:$rA, (PPCmfvsr f64:$XT))]>,
Requires<[In64BitMode]>;
let isCodeGenOnly = 1 in
def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsrc:$XT),
"mfvsrd $rA, $XT", IIC_VecGeneral,
[]>,
Requires<[In64BitMode]>;
def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT),
"mfvsrwz $rA, $XT", IIC_VecGeneral,
[(set i32:$rA, (PPCmfvsr f64:$XT))]>;
let isCodeGenOnly = 1 in
def MFVRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsrc:$XT),
"mfvsrwz $rA, $XT", IIC_VecGeneral,
[]>;
def MTVSRD : XX1_RS6_RD5_XO<31, 179, (outs vsfrc:$XT), (ins g8rc:$rA),
"mtvsrd $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsra i64:$rA))]>,
Requires<[In64BitMode]>;
let isCodeGenOnly = 1 in
def MTVRD : XX1_RS6_RD5_XO<31, 179, (outs vsrc:$XT), (ins g8rc:$rA),
"mtvsrd $XT, $rA", IIC_VecGeneral,
[]>,
Requires<[In64BitMode]>;
def MTVSRWA : XX1_RS6_RD5_XO<31, 211, (outs vsfrc:$XT), (ins gprc:$rA),
"mtvsrwa $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsra i32:$rA))]>;
let isCodeGenOnly = 1 in
def MTVRWA : XX1_RS6_RD5_XO<31, 211, (outs vsrc:$XT), (ins gprc:$rA),
"mtvsrwa $XT, $rA", IIC_VecGeneral,
[]>;
def MTVSRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsfrc:$XT), (ins gprc:$rA),
"mtvsrwz $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsrz i32:$rA))]>;
let isCodeGenOnly = 1 in
def MTVRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsrc:$XT), (ins gprc:$rA),
"mtvsrwz $XT, $rA", IIC_VecGeneral,
[]>;
} // HasDirectMove
let Predicates = [IsISA3_0, HasDirectMove] in {
def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
"mtvsrws $XT, $rA", IIC_VecGeneral, []>;
def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB),
"mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
[]>, Requires<[In64BitMode]>;
def MFVSRLD: XX1_RS6_RD5_XO<31, 307, (outs g8rc:$rA), (ins vsrc:$XT),
"mfvsrld $rA, $XT", IIC_VecGeneral,
[]>, Requires<[In64BitMode]>;
} // IsISA3_0, HasDirectMove
} // AddedComplexity = 400
// We want to parse this from asm, but we don't want to emit it, since it
// would be emitted with a VSX register. So leave Emit = 0 here.
def : InstAlias<"mfvrd $rA, $XT",
(MFVRD g8rc:$rA, vrrc:$XT), 0>;
def : InstAlias<"mffprd $rA, $src",
(MFVSRD g8rc:$rA, f8rc:$src)>;
def : InstAlias<"mtvrd $XT, $rA",
(MTVRD vrrc:$XT, g8rc:$rA), 0>;
def : InstAlias<"mtfprd $dst, $rA",
(MTVSRD f8rc:$dst, g8rc:$rA)>;
def : InstAlias<"mfvrwz $rA, $XT",
(MFVRWZ gprc:$rA, vrrc:$XT), 0>;
def : InstAlias<"mffprwz $rA, $src",
(MFVSRWZ gprc:$rA, f8rc:$src)>;
def : InstAlias<"mtvrwa $XT, $rA",
(MTVRWA vrrc:$XT, gprc:$rA), 0>;
def : InstAlias<"mtfprwa $dst, $rA",
(MTVSRWA f8rc:$dst, gprc:$rA)>;
def : InstAlias<"mtvrwz $XT, $rA",
(MTVRWZ vrrc:$XT, gprc:$rA), 0>;
def : InstAlias<"mtfprwz $dst, $rA",
(MTVSRWZ f8rc:$dst, gprc:$rA)>;
/* Direct moves of various widths from GPRs into VSRs. Each move lines
the value up into element 0 (both BE and LE). Namely, entities smaller than
a doubleword are shifted left and moved for BE. For LE, they're moved, then
swapped to go into the least significant element of the VSR.
*/
def MovesToVSR {
dag BE_BYTE_0 =
(MTVSRD
(RLDICR
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 56, 7));
dag BE_HALF_0 =
(MTVSRD
(RLDICR
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 48, 15));
dag BE_WORD_0 =
(MTVSRD
(RLDICR
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 32, 31));
dag BE_DWORD_0 = (MTVSRD $A);
dag LE_MTVSRW = (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32));
dag LE_WORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
LE_MTVSRW, sub_64));
dag LE_WORD_0 = (XXPERMDI LE_WORD_1, LE_WORD_1, 2);
dag LE_DWORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
BE_DWORD_0, sub_64));
dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2);
}
/* Patterns for extracting elements out of vectors. Integer elements are
extracted using direct move operations. Extraction of elements whose
indices are not available at compile time is also provided via the
various _VARIABLE_ patterns.
The numbering for the DAGs is for LE, but when used on BE, the correct
LE element can just be used (i.e. LE_BYTE_2 == BE_BYTE_13).
*/
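// A note on the RLDICL idiom used below (for illustration): (RLDICL x, SH, MB)
// rotates x left by SH bits and clears the high MB bits. So LE_HALF_1 =
// (RLDICL LE_DWORD_0, 48, 48) rotates left by 48 (equivalently, right by 16)
// and keeps the low 16 bits, right-justifying halfword 1 of the doubleword.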
def VectorExtractions {
// Doubleword extraction
dag LE_DWORD_0 =
(MFVSRD
(EXTRACT_SUBREG
(XXPERMDI (COPY_TO_REGCLASS $S, VSRC),
(COPY_TO_REGCLASS $S, VSRC), 2), sub_64));
dag LE_DWORD_1 = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64));
// Word extraction
dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64));
dag LE_WORD_1 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 1), sub_64));
dag LE_WORD_2 = (MFVSRWZ (EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64));
dag LE_WORD_3 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 3), sub_64));
// Halfword extraction
dag LE_HALF_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 48), sub_32));
dag LE_HALF_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 48), sub_32));
dag LE_HALF_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 48), sub_32));
dag LE_HALF_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 48), sub_32));
dag LE_HALF_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 48), sub_32));
dag LE_HALF_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 48), sub_32));
dag LE_HALF_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 48), sub_32));
dag LE_HALF_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 48), sub_32));
// Byte extraction
dag LE_BYTE_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 56), sub_32));
dag LE_BYTE_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 56, 56), sub_32));
dag LE_BYTE_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 56), sub_32));
dag LE_BYTE_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 40, 56), sub_32));
dag LE_BYTE_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 56), sub_32));
dag LE_BYTE_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 24, 56), sub_32));
dag LE_BYTE_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 56), sub_32));
dag LE_BYTE_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 8, 56), sub_32));
dag LE_BYTE_8 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 56), sub_32));
dag LE_BYTE_9 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 56, 56), sub_32));
dag LE_BYTE_10 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 56), sub_32));
dag LE_BYTE_11 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 40, 56), sub_32));
dag LE_BYTE_12 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 56), sub_32));
dag LE_BYTE_13 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 24, 56), sub_32));
dag LE_BYTE_14 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 56), sub_32));
dag LE_BYTE_15 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 8, 56), sub_32));
/* Variable element number (BE and LE patterns must be specified separately)
This is a rather involved process.
Conceptually, this is how the move is accomplished:
1. Identify which doubleword contains the element
2. Shift in the VMX register so that the correct doubleword is lined up
for the MFVSRD
3. Perform the move so that the element (along with some extra stuff)
is in the GPR
4. Right shift within the GPR so that the element is right-justified
Of course, the index is an element number which has a different meaning
on LE/BE so the patterns have to be specified separately.
Note: The final result will be the element right-justified with high
order bits being arbitrarily defined (namely, whatever was in the
vector register to the left of the value originally).
*/
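// End-to-end illustration for an LE variable byte extract with $Idx = 5:
// 1. (ANDC8 (LI8 8), 5) = 8 & ~5 = 8, so LVSL builds a shift-left-by-8-bytes
// permute vector (element 5 lives in the right doubleword).
// 2. VPERM shifts the vector left by 8 bytes, putting that doubleword on
// the left where MFVSRD can see it.
// 3. MFVSRD moves the doubleword to a GPR; byte 5 now sits at bits 40-47.
// 4. (AND8 (LI8 7), 5) << 3 = 40, and SRD by 40 right-justifies the byte.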
/* LE variable byte
Number 1. above:
- For elements 0-7, we shift left by 8 bytes since they're on the right
- For elements 8-15, we need not shift (shift left by zero bytes)
This is accomplished by inverting the bits of the index and AND-ing
with 0x8 (i.e. the result keeps only bit 60 of the index, inverted).
*/
dag LE_VBYTE_PERM_VEC = (v16i8 (LVSL ZERO8, (ANDC8 (LI8 8), $Idx)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
dag LE_VBYTE_PERMUTE = (v16i8 (VPERM $S, $S, LE_VBYTE_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
dag LE_MV_VBYTE = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS LE_VBYTE_PERMUTE, VSRC)),
sub_64));
/* Number 4. above:
- Truncate the element number to the range 0-7 (8-15 are symmetrical
and out of range values are truncated accordingly)
- Multiply by 8 as we need to shift right by the number of bits, not bytes
- Shift right in the GPR by the calculated value
*/
dag LE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 7), $Idx), 3, 60),
sub_32);
dag LE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD LE_MV_VBYTE, LE_VBYTE_SHIFT),
sub_32);
/* LE variable halfword
Number 1. above:
- For elements 0-3, we shift left by 8 bytes since they're on the right
- For elements 4-7, we need not shift (shift left by zero bytes)
Similarly to the byte pattern, we invert the bits of the index, but we
AND with 0x4 (i.e. the result keeps only bit 61 of the index, inverted).
Since that yields 0 or 4 while the shift must be 0 or 8 bytes, we
multiply by 2.
*/
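// For illustration: with $Idx = 2, (ANDC8 (LI8 4), 2) = 4 & ~2 = 4, doubled
// by the RLDICR to an 8-byte shift; with $Idx = 5, 4 & ~5 = 0, so no shift.
// The later shift count is (Idx & 3) * 16 bits.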
dag LE_VHALF_PERM_VEC =
(v16i8 (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
dag LE_VHALF_PERMUTE = (v16i8 (VPERM $S, $S, LE_VHALF_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
dag LE_MV_VHALF = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS LE_VHALF_PERMUTE, VSRC)),
sub_64));
/* Number 4. above:
- Truncate the element number to the range 0-3 (4-7 are symmetrical
and out of range values are truncated accordingly)
- Multiply by 16 as we need to shift right by the number of bits
- Shift right in the GPR by the calculated value
*/
dag LE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 3), $Idx), 4, 59),
sub_32);
dag LE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD LE_MV_VHALF, LE_VHALF_SHIFT),
sub_32);
/* LE variable word
Number 1. above:
- For elements 0-1, we shift left by 8 bytes since they're on the right
- For elements 2-3, we need not shift
*/
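// For illustration: (ANDC8 (LI8 2), $Idx) << 2 gives an 8-byte shift for
// $Idx = 0 or 1 (2 & ~Idx = 2) and no shift for $Idx = 2 or 3 (2 & ~Idx = 0).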
dag LE_VWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
(RLDICR (ANDC8 (LI8 2), $Idx), 2, 61)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
dag LE_VWORD_PERMUTE = (v16i8 (VPERM $S, $S, LE_VWORD_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
dag LE_MV_VWORD = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS LE_VWORD_PERMUTE, VSRC)),
sub_64));
/* Number 4. above:
- Truncate the element number to the range 0-1 (2-3 are symmetrical
and out of range values are truncated accordingly)
- Multiply by 32 as we need to shift right by the number of bits
- Shift right in the GPR by the calculated value
*/
dag LE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 1), $Idx), 5, 58),
sub_32);
dag LE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD LE_MV_VWORD, LE_VWORD_SHIFT),
sub_32);
/* LE variable doubleword
Number 1. above:
- For element 0, we shift left by 8 bytes since it's on the right
- For element 1, we need not shift
*/
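// For illustration: (ANDC8 (LI8 1), $Idx) << 3 gives an 8-byte shift for
// $Idx = 0 and no shift for $Idx = 1.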
dag LE_VDWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
(RLDICR (ANDC8 (LI8 1), $Idx), 3, 60)));
// Number 2. above:
// - Now that we set up the shift amount, we shift in the VMX register
dag LE_VDWORD_PERMUTE = (v16i8 (VPERM $S, $S, LE_VDWORD_PERM_VEC));
// Number 3. above:
// - The doubleword containing our element is moved to a GPR
// - Number 4. is not needed for the doubleword as the value is 64-bits
dag LE_VARIABLE_DWORD =
(MFVSRD (EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS LE_VDWORD_PERMUTE, VSRC)),
sub_64));
/* LE variable float
- Shift the vector so the desired element lines up with BE word 0
- Convert the 32-bit single-precision value to double-precision format
*/
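// For illustration: the byte shift is (3 XOR $Idx) * 4, e.g. 12 bytes for
// $Idx = 0 (LE word 0 sits in BE bytes 12-15) and 0 bytes for $Idx = 3.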
dag LE_VFLOAT_PERM_VEC = (v16i8 (LVSL ZERO8,
(RLDICR (XOR8 (LI8 3), $Idx), 2, 61)));
dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC);
dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE);
/* LE variable double
Same as the LE doubleword except there is no move.
*/
dag LE_VDOUBLE_PERMUTE = (v16i8 (VPERM (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
(v16i8 (COPY_TO_REGCLASS $S, VRRC)),
LE_VDWORD_PERM_VEC));
dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC);
/* BE variable byte
The algorithm here is the same as the LE variable byte except:
- The shift in the VMX register is by 0/8 for opposite element numbers so
we simply AND the element number with 0x8
- The order of elements after the move to GPR is reversed, so we invert
the bits of the index prior to truncating to the range 0-7
*/
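// For illustration: with $Idx = 13, (ANDI8_rec 13, 8) = 8 shifts the right
// doubleword into view, and (ANDC8 (LI8 7), 13) << 3 = 2 * 8 = 16 bits of
// right shift, right-justifying BE byte 13.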
dag BE_VBYTE_PERM_VEC = (v16i8 (LVSL ZERO8, (ANDI8_rec $Idx, 8)));
dag BE_VBYTE_PERMUTE = (v16i8 (VPERM $S, $S, BE_VBYTE_PERM_VEC));
dag BE_MV_VBYTE = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)),
sub_64));
dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60),
sub_32);
dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT),
sub_32);
/* BE variable halfword
The algorithm here is the same as the LE variable halfword except:
- The shift in the VMX register is by 0/8 for opposite element numbers so
we simply AND the element number with 0x4 and multiply by 2
- The order of elements after the move to GPR is reversed, so we invert
the bits of the index prior to truncating to the range 0-3
*/
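// For illustration: with $Idx = 6, (ANDI8_rec 6, 4) = 4, doubled to an
// 8-byte shift; (ANDC8 (LI8 3), 6) << 4 = 1 * 16 = 16 bits of right shift.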
dag BE_VHALF_PERM_VEC = (v16i8 (LVSL ZERO8,
(RLDICR (ANDI8_rec $Idx, 4), 1, 62)));
dag BE_VHALF_PERMUTE = (v16i8 (VPERM $S, $S, BE_VHALF_PERM_VEC));
dag BE_MV_VHALF = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)),
sub_64));
dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 59),
sub_32);
dag BE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD BE_MV_VHALF, BE_VHALF_SHIFT),
sub_32);
/* BE variable word
The algorithm is the same as the LE variable word except:
- The shift in the VMX register happens for opposite element numbers
- The order of elements after the move to GPR is reversed, so we invert
the bits of the index prior to truncating to the range 0-1
*/
dag BE_VWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
(RLDICR (ANDI8_rec $Idx, 2), 2, 61)));
dag BE_VWORD_PERMUTE = (v16i8 (VPERM $S, $S, BE_VWORD_PERM_VEC));
dag BE_MV_VWORD = (MFVSRD
(EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)),
sub_64));
dag BE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 1), $Idx), 5, 58),
sub_32);
dag BE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD BE_MV_VWORD, BE_VWORD_SHIFT),
sub_32);
/* BE variable doubleword
Same as the LE doubleword except we shift in the VMX register for opposite
element indices.
*/
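// For illustration: (ANDI8_rec $Idx, 1) << 3 gives no shift for BE element 0
// (already in the left doubleword) and an 8-byte shift for element 1.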
dag BE_VDWORD_PERM_VEC = (v16i8 (LVSL ZERO8,
(RLDICR (ANDI8_rec $Idx, 1), 3, 60)));
dag BE_VDWORD_PERMUTE = (v16i8 (VPERM $S, $S, BE_VDWORD_PERM_VEC));
dag BE_VARIABLE_DWORD =
(MFVSRD (EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)),
sub_64));
/* BE variable float
- Shift the vector so the desired element lines up with BE word 0
- Convert the 32-bit single-precision value to double-precision format
*/
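// For illustration: the byte shift is simply $Idx * 4, e.g. 4 bytes for
// $Idx = 1 (BE word 1 occupies bytes 4-7).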
dag BE_VFLOAT_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR $Idx, 2, 61)));
dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC);
dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE);
/* BE variable double
Same as the BE doubleword except there is no move.
*/
dag BE_VDOUBLE_PERMUTE = (v16i8 (VPERM (v16i8 (COPY_TO_REGCLASS $S, VRRC)),
(v16i8 (COPY_TO_REGCLASS $S, VRRC)),
BE_VDWORD_PERM_VEC));
dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC);
}
def NoP9Altivec : Predicate<"!PPCSubTarget->hasP9Altivec()">;
let AddedComplexity = 400 in {
// v4f32 scalar <-> vector conversions (BE)
let Predicates = [IsBigEndian, HasP8Vector] in {
def : Pat<(v4f32 (scalar_to_vector f32:$A)),
(v4f32 (XSCVDPSPN $A))>;
def : Pat<(f32 (vector_extract v4f32:$S, 0)),
(f32 (XSCVSPDPN $S))>;
def : Pat<(f32 (vector_extract v4f32:$S, 1)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 2)),
(f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 3)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
(f32 VectorExtractions.BE_VARIABLE_FLOAT)>;
} // IsBigEndian, HasP8Vector
// Variable index vector_extract for v2f64 does not require P8Vector
let Predicates = [IsBigEndian, HasVSX] in
def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
(f64 VectorExtractions.BE_VARIABLE_DOUBLE)>;
let Predicates = [IsBigEndian, HasDirectMove] in {
// v16i8 scalar <-> vector conversions (BE)
def : Pat<(v16i8 (scalar_to_vector i32:$A)),
(v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>;
def : Pat<(v8i16 (scalar_to_vector i32:$A)),
(v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>;
def : Pat<(v4i32 (scalar_to_vector i32:$A)),
(v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
def : Pat<(v2i64 (scalar_to_vector i64:$A)),
(v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
// v2i64 scalar <-> vector conversions (BE)
def : Pat<(i64 (vector_extract v2i64:$S, 0)),
(i64 VectorExtractions.LE_DWORD_1)>;
def : Pat<(i64 (vector_extract v2i64:$S, 1)),
(i64 VectorExtractions.LE_DWORD_0)>;
def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
(i64 VectorExtractions.BE_VARIABLE_DWORD)>;
} // IsBigEndian, HasDirectMove
let Predicates = [IsBigEndian, HasDirectMove, NoP9Altivec] in {
def : Pat<(i32 (vector_extract v16i8:$S, 0)),
(i32 VectorExtractions.LE_BYTE_15)>;
def : Pat<(i32 (vector_extract v16i8:$S, 1)),
(i32 VectorExtractions.LE_BYTE_14)>;
def : Pat<(i32 (vector_extract v16i8:$S, 2)),
(i32 VectorExtractions.LE_BYTE_13)>;
def : Pat<(i32 (vector_extract v16i8:$S, 3)),
(i32 VectorExtractions.LE_BYTE_12)>;
def : Pat<(i32 (vector_extract v16i8:$S, 4)),
(i32 VectorExtractions.LE_BYTE_11)>;
def : Pat<(i32 (vector_extract v16i8:$S, 5)),
(i32 VectorExtractions.LE_BYTE_10)>;
def : Pat<(i32 (vector_extract v16i8:$S, 6)),
(i32 VectorExtractions.LE_BYTE_9)>;
def : Pat<(i32 (vector_extract v16i8:$S, 7)),
(i32 VectorExtractions.LE_BYTE_8)>;
def : Pat<(i32 (vector_extract v16i8:$S, 8)),
(i32 VectorExtractions.LE_BYTE_7)>;
def : Pat<(i32 (vector_extract v16i8:$S, 9)),
(i32 VectorExtractions.LE_BYTE_6)>;
def : Pat<(i32 (vector_extract v16i8:$S, 10)),
(i32 VectorExtractions.LE_BYTE_5)>;
def : Pat<(i32 (vector_extract v16i8:$S, 11)),
(i32 VectorExtractions.LE_BYTE_4)>;
def : Pat<(i32 (vector_extract v16i8:$S, 12)),
(i32 VectorExtractions.LE_BYTE_3)>;
def : Pat<(i32 (vector_extract v16i8:$S, 13)),
(i32 VectorExtractions.LE_BYTE_2)>;
def : Pat<(i32 (vector_extract v16i8:$S, 14)),
(i32 VectorExtractions.LE_BYTE_1)>;
def : Pat<(i32 (vector_extract v16i8:$S, 15)),
(i32 VectorExtractions.LE_BYTE_0)>;
def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
(i32 VectorExtractions.BE_VARIABLE_BYTE)>;
// v8i16 scalar <-> vector conversions (BE)
def : Pat<(i32 (vector_extract v8i16:$S, 0)),
(i32 VectorExtractions.LE_HALF_7)>;
def : Pat<(i32 (vector_extract v8i16:$S, 1)),
(i32 VectorExtractions.LE_HALF_6)>;
def : Pat<(i32 (vector_extract v8i16:$S, 2)),
(i32 VectorExtractions.LE_HALF_5)>;
def : Pat<(i32 (vector_extract v8i16:$S, 3)),
(i32 VectorExtractions.LE_HALF_4)>;
def : Pat<(i32 (vector_extract v8i16:$S, 4)),
(i32 VectorExtractions.LE_HALF_3)>;
def : Pat<(i32 (vector_extract v8i16:$S, 5)),
(i32 VectorExtractions.LE_HALF_2)>;
def : Pat<(i32 (vector_extract v8i16:$S, 6)),
(i32 VectorExtractions.LE_HALF_1)>;
def : Pat<(i32 (vector_extract v8i16:$S, 7)),
(i32 VectorExtractions.LE_HALF_0)>;
def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
(i32 VectorExtractions.BE_VARIABLE_HALF)>;
// v4i32 scalar <-> vector conversions (BE)
def : Pat<(i32 (vector_extract v4i32:$S, 0)),
(i32 VectorExtractions.LE_WORD_3)>;
def : Pat<(i32 (vector_extract v4i32:$S, 1)),
(i32 VectorExtractions.LE_WORD_2)>;
def : Pat<(i32 (vector_extract v4i32:$S, 2)),
(i32 VectorExtractions.LE_WORD_1)>;
def : Pat<(i32 (vector_extract v4i32:$S, 3)),
(i32 VectorExtractions.LE_WORD_0)>;
def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
(i32 VectorExtractions.BE_VARIABLE_WORD)>;
} // IsBigEndian, HasDirectMove, NoP9Altivec
// v4f32 scalar <-> vector conversions (LE)
let Predicates = [IsLittleEndian, HasP8Vector] in {
def : Pat<(v4f32 (scalar_to_vector f32:$A)),
(v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>;
def : Pat<(f32 (vector_extract v4f32:$S, 0)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 1)),
(f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 2)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 3)),
(f32 (XSCVSPDPN $S))>;
def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
(f32 VectorExtractions.LE_VARIABLE_FLOAT)>;
} // IsLittleEndian, HasP8Vector
// Variable index vector_extract for v2f64 does not require P8Vector
let Predicates = [IsLittleEndian, HasVSX] in
def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
(f64 VectorExtractions.LE_VARIABLE_DOUBLE)>;
def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
(STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
(STXVW4X $rS, xoaddr:$dst)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
// Variable index unsigned vector_extract on Power9
let Predicates = [HasP9Altivec, IsLittleEndian] in {
def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
(VEXTUBRX $Idx, $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))),
(VEXTUHRX (RLWINM8 $Idx, 1, 28, 30), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))),
(VEXTUHRX (LI8 0), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))),
(VEXTUHRX (LI8 2), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))),
(VEXTUHRX (LI8 4), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))),
(VEXTUHRX (LI8 6), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))),
(VEXTUHRX (LI8 8), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))),
(VEXTUHRX (LI8 10), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))),
(VEXTUHRX (LI8 12), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))),
(VEXTUHRX (LI8 14), $S)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
(VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
(VEXTUWRX (LI8 0), $S)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
(VEXTUWRX (LI8 4), $S)>;
// For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(i32 VectorExtractions.LE_WORD_2), sub_32)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
(VEXTUWRX (LI8 12), $S)>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
(EXTSW (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
(EXTSW (VEXTUWRX (LI8 0), $S))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
(EXTSW (VEXTUWRX (LI8 4), $S))>;
// For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
(EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(i32 VectorExtractions.LE_WORD_2), sub_32))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
(EXTSW (VEXTUWRX (LI8 12), $S))>;
def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
(i32 (EXTRACT_SUBREG (VEXTUBRX $Idx, $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 0)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 0), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 1)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 1), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 2)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 2), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 3)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 3), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 4)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 4), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 5)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 5), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 6)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 6), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 7)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 7), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 8)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 8), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 9)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 9), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 10)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 10), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 11)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 11), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 12)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 12), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 13)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 13), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 14)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 14), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 15)),
(i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 15), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
(i32 (EXTRACT_SUBREG (VEXTUHRX
(RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 0)),
(i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 0), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 1)),
(i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 2), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 2)),
(i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 4), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 3)),
(i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 6), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 4)),
(i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 8), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 5)),
(i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 10), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 6)),
(i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 12), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 7)),
(i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 14), $S), sub_32))>;
def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
(i32 (EXTRACT_SUBREG (VEXTUWRX
(RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>;
def : Pat<(i32 (vector_extract v4i32:$S, 0)),
(i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 0), $S), sub_32))>;
def : Pat<(i32 (vector_extract v4i32:$S, 1)),
(i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 4), $S), sub_32))>;
// For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
def : Pat<(i32 (vector_extract v4i32:$S, 2)),
(i32 VectorExtractions.LE_WORD_2)>;
def : Pat<(i32 (vector_extract v4i32:$S, 3)),
(i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 12), $S), sub_32))>;
}
let Predicates = [HasP9Altivec, IsBigEndian] in {
def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
(VEXTUBLX $Idx, $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))),
(VEXTUHLX (RLWINM8 $Idx, 1, 28, 30), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))),
(VEXTUHLX (LI8 0), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))),
(VEXTUHLX (LI8 2), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))),
(VEXTUHLX (LI8 4), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))),
(VEXTUHLX (LI8 6), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))),
(VEXTUHLX (LI8 8), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))),
(VEXTUHLX (LI8 10), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))),
(VEXTUHLX (LI8 12), $S)>;
def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))),
(VEXTUHLX (LI8 14), $S)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
(VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
(VEXTUWLX (LI8 0), $S)>;
// For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(i32 VectorExtractions.LE_WORD_2), sub_32)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
(VEXTUWLX (LI8 8), $S)>;
def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
(VEXTUWLX (LI8 12), $S)>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
(EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
(EXTSW (VEXTUWLX (LI8 0), $S))>;
// For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
(EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(i32 VectorExtractions.LE_WORD_2), sub_32))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
(EXTSW (VEXTUWLX (LI8 8), $S))>;
def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
(EXTSW (VEXTUWLX (LI8 12), $S))>;
def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
(i32 (EXTRACT_SUBREG (VEXTUBLX $Idx, $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 0)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 0), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 1)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 1), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 2)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 2), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 3)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 3), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 4)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 4), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 5)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 5), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 6)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 6), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 7)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 7), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 8)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 8), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 9)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 9), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 10)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 10), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 11)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 11), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 12)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 12), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 13)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 13), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 14)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 14), $S), sub_32))>;
def : Pat<(i32 (vector_extract v16i8:$S, 15)),
(i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 15), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
(i32 (EXTRACT_SUBREG (VEXTUHLX
(RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 0)),
(i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 0), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 1)),
(i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 2), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 2)),
(i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 4), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 3)),
(i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 6), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 4)),
(i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 8), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 5)),
(i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 10), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 6)),
(i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 12), $S), sub_32))>;
def : Pat<(i32 (vector_extract v8i16:$S, 7)),
(i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 14), $S), sub_32))>;
def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
(i32 (EXTRACT_SUBREG (VEXTUWLX
(RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>;
def : Pat<(i32 (vector_extract v4i32:$S, 0)),
(i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 0), $S), sub_32))>;
// For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
def : Pat<(i32 (vector_extract v4i32:$S, 1)),
(i32 VectorExtractions.LE_WORD_2)>;
def : Pat<(i32 (vector_extract v4i32:$S, 2)),
(i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 8), $S), sub_32))>;
def : Pat<(i32 (vector_extract v4i32:$S, 3)),
(i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 12), $S), sub_32))>;
}
let Predicates = [IsLittleEndian, HasDirectMove] in {
// v16i8 scalar <-> vector conversions (LE)
def : Pat<(v16i8 (scalar_to_vector i32:$A)),
(v16i8 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>;
def : Pat<(v8i16 (scalar_to_vector i32:$A)),
(v8i16 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>;
def : Pat<(v4i32 (scalar_to_vector i32:$A)),
(v4i32 MovesToVSR.LE_WORD_0)>;
def : Pat<(v2i64 (scalar_to_vector i64:$A)),
(v2i64 MovesToVSR.LE_DWORD_0)>;
// v2i64 scalar <-> vector conversions (LE)
def : Pat<(i64 (vector_extract v2i64:$S, 0)),
(i64 VectorExtractions.LE_DWORD_0)>;
def : Pat<(i64 (vector_extract v2i64:$S, 1)),
(i64 VectorExtractions.LE_DWORD_1)>;
def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
(i64 VectorExtractions.LE_VARIABLE_DWORD)>;
} // IsLittleEndian, HasDirectMove
let Predicates = [IsLittleEndian, HasDirectMove, NoP9Altivec] in {
def : Pat<(i32 (vector_extract v16i8:$S, 0)),
(i32 VectorExtractions.LE_BYTE_0)>;
def : Pat<(i32 (vector_extract v16i8:$S, 1)),
(i32 VectorExtractions.LE_BYTE_1)>;
def : Pat<(i32 (vector_extract v16i8:$S, 2)),
(i32 VectorExtractions.LE_BYTE_2)>;
def : Pat<(i32 (vector_extract v16i8:$S, 3)),
(i32 VectorExtractions.LE_BYTE_3)>;
def : Pat<(i32 (vector_extract v16i8:$S, 4)),
(i32 VectorExtractions.LE_BYTE_4)>;
def : Pat<(i32 (vector_extract v16i8:$S, 5)),
(i32 VectorExtractions.LE_BYTE_5)>;
def : Pat<(i32 (vector_extract v16i8:$S, 6)),
(i32 VectorExtractions.LE_BYTE_6)>;
def : Pat<(i32 (vector_extract v16i8:$S, 7)),
(i32 VectorExtractions.LE_BYTE_7)>;
def : Pat<(i32 (vector_extract v16i8:$S, 8)),
(i32 VectorExtractions.LE_BYTE_8)>;
def : Pat<(i32 (vector_extract v16i8:$S, 9)),
(i32 VectorExtractions.LE_BYTE_9)>;
def : Pat<(i32 (vector_extract v16i8:$S, 10)),
(i32 VectorExtractions.LE_BYTE_10)>;
def : Pat<(i32 (vector_extract v16i8:$S, 11)),
(i32 VectorExtractions.LE_BYTE_11)>;
def : Pat<(i32 (vector_extract v16i8:$S, 12)),
(i32 VectorExtractions.LE_BYTE_12)>;
def : Pat<(i32 (vector_extract v16i8:$S, 13)),
(i32 VectorExtractions.LE_BYTE_13)>;
def : Pat<(i32 (vector_extract v16i8:$S, 14)),
(i32 VectorExtractions.LE_BYTE_14)>;
def : Pat<(i32 (vector_extract v16i8:$S, 15)),
(i32 VectorExtractions.LE_BYTE_15)>;
def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
(i32 VectorExtractions.LE_VARIABLE_BYTE)>;
// v8i16 scalar <-> vector conversions (LE)
def : Pat<(i32 (vector_extract v8i16:$S, 0)),
(i32 VectorExtractions.LE_HALF_0)>;
def : Pat<(i32 (vector_extract v8i16:$S, 1)),
(i32 VectorExtractions.LE_HALF_1)>;
def : Pat<(i32 (vector_extract v8i16:$S, 2)),
(i32 VectorExtractions.LE_HALF_2)>;
def : Pat<(i32 (vector_extract v8i16:$S, 3)),
(i32 VectorExtractions.LE_HALF_3)>;
def : Pat<(i32 (vector_extract v8i16:$S, 4)),
(i32 VectorExtractions.LE_HALF_4)>;
def : Pat<(i32 (vector_extract v8i16:$S, 5)),
(i32 VectorExtractions.LE_HALF_5)>;
def : Pat<(i32 (vector_extract v8i16:$S, 6)),
(i32 VectorExtractions.LE_HALF_6)>;
def : Pat<(i32 (vector_extract v8i16:$S, 7)),
(i32 VectorExtractions.LE_HALF_7)>;
def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
(i32 VectorExtractions.LE_VARIABLE_HALF)>;
// v4i32 scalar <-> vector conversions (LE)
def : Pat<(i32 (vector_extract v4i32:$S, 0)),
(i32 VectorExtractions.LE_WORD_0)>;
def : Pat<(i32 (vector_extract v4i32:$S, 1)),
(i32 VectorExtractions.LE_WORD_1)>;
def : Pat<(i32 (vector_extract v4i32:$S, 2)),
(i32 VectorExtractions.LE_WORD_2)>;
def : Pat<(i32 (vector_extract v4i32:$S, 3)),
(i32 VectorExtractions.LE_WORD_3)>;
def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
(i32 VectorExtractions.LE_VARIABLE_WORD)>;
} // IsLittleEndian, HasDirectMove, NoP9Altivec
let Predicates = [HasDirectMove, HasVSX] in {
// bitconvert f32 -> i32
// (convert to 32-bit fp single, shift right 1 word, move to GPR)
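// For illustration: XSCVDPSPN leaves the SP bit pattern in word 0 of the
// VSR; XXSLDWI by 3 rotates it into word 1 (the low half of doubleword 0),
// which is the word MFVSRWZ reads.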
def : Pat<(i32 (bitconvert f32:$S)),
(i32 (MFVSRWZ (EXTRACT_SUBREG
(XXSLDWI (XSCVDPSPN $S), (XSCVDPSPN $S), 3),
sub_64)))>;
// bitconvert i32 -> f32
// (move to FPR, shift left 1 word, convert to 64-bit fp single)
def : Pat<(f32 (bitconvert i32:$A)),
(f32 (XSCVSPDPN
(XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>;
// bitconvert f64 -> i64
// (move to GPR, nothing else needed)
def : Pat<(i64 (bitconvert f64:$S)),
(i64 (MFVSRD $S))>;
// bitconvert i64 -> f64
// (move to FPR, nothing else needed)
def : Pat<(f64 (bitconvert i64:$S)),
(f64 (MTVSRD $S))>;
// Rounding to integer.
def : Pat<(i64 (lrint f64:$S)),
(i64 (MFVSRD (FCTID $S)))>;
def : Pat<(i64 (lrint f32:$S)),
(i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>;
def : Pat<(i64 (llrint f64:$S)),
(i64 (MFVSRD (FCTID $S)))>;
def : Pat<(i64 (llrint f32:$S)),
(i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>;
def : Pat<(i64 (lround f64:$S)),
(i64 (MFVSRD (FCTID (XSRDPI $S))))>;
def : Pat<(i64 (lround f32:$S)),
(i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
def : Pat<(i64 (llround f64:$S)),
(i64 (MFVSRD (FCTID (XSRDPI $S))))>;
def : Pat<(i64 (llround f32:$S)),
(i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
}
let Predicates = [HasVSX] in {
// Rounding for single precision.
def : Pat<(f32 (fround f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPI
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f32 (fnearbyint f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIC
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f32 (ffloor f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIM
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f32 (fceil f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIP
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f32 (ftrunc f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIZ
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
}
// Materialize a zero-vector of long long
def : Pat<(v2i64 immAllZerosV),
(v2i64 (XXLXORz))>;
}
def AlignValues {
dag F32_TO_BE_WORD1 = (v4f32 (XXSLDWI (XSCVDPSPN $B), (XSCVDPSPN $B), 3));
dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC);
}
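// Note (for illustration): both DAGs above leave the 32-bit value in BE
// word 1 of a VSR, which is the word XXINSERTW inserts from; see the
// insertelt patterns further below.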
// The following VSX instructions were introduced in Power ISA 3.0
def HasP9Vector : Predicate<"PPCSubTarget->hasP9Vector()">;
let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// [PO VRT XO VRB XO /]
class X_VT5_XO5_VB5<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
list<dag> pattern>
: X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vrrc:$vB),
!strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
// [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
class X_VT5_XO5_VB5_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
list<dag> pattern>
: X_VT5_XO5_VB5<opcode, xo2, xo, opc, pattern>, isRecordForm;
// [PO VRT XO VRB XO /], but only the left 64 bits (or fewer) of VRB are used,
// so we use a different operand class for VRB.
class X_VT5_XO5_VB5_TyVB<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
RegisterOperand vbtype, list<dag> pattern>
: X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vbtype:$vB),
!strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
// [PO VRT XO VRB XO /]
class X_VT5_XO5_VB5_VSFR<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
list<dag> pattern>
: X_RD5_XO5_RS5<opcode, xo2, xo, (outs vfrc:$vT), (ins vrrc:$vB),
!strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
// [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
class X_VT5_XO5_VB5_VSFR_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
list<dag> pattern>
: X_VT5_XO5_VB5_VSFR<opcode, xo2, xo, opc, pattern>, isRecordForm;
// [PO T XO B XO BX /]
class XX2_RT5_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
list<dag> pattern>
: XX2_RD5_XO5_RS6<opcode, xo2, xo, (outs g8rc:$rT), (ins vsfrc:$XB),
!strconcat(opc, " $rT, $XB"), IIC_VecFP, pattern>;
// [PO T XO B XO BX TX]
class XX2_XT6_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
RegisterOperand vtype, list<dag> pattern>
: XX2_RD6_XO5_RS6<opcode, xo2, xo, (outs vtype:$XT), (ins vtype:$XB),
!strconcat(opc, " $XT, $XB"), IIC_VecFP, pattern>;
// [PO T A B XO AX BX TX], src and dest register use different operand class
class XX3_XT5_XA5_XB5<bits<6> opcode, bits<8> xo, string opc,
RegisterOperand xty, RegisterOperand aty, RegisterOperand bty,
InstrItinClass itin, list<dag> pattern>
: XX3Form<opcode, xo, (outs xty:$XT), (ins aty:$XA, bty:$XB),
!strconcat(opc, " $XT, $XA, $XB"), itin, pattern>;
// [PO VRT VRA VRB XO /]
class X_VT5_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
list<dag> pattern>
: XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vA, vrrc:$vB),
!strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>;
// [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
class X_VT5_VA5_VB5_Ro<bits<6> opcode, bits<10> xo, string opc,
list<dag> pattern>
: X_VT5_VA5_VB5<opcode, xo, opc, pattern>, isRecordForm;
// [PO VRT VRA VRB XO /]
class X_VT5_VA5_VB5_FMA<bits<6> opcode, bits<10> xo, string opc,
list<dag> pattern>
: XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vTi, vrrc:$vA, vrrc:$vB),
!strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>,
RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">;
// [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
class X_VT5_VA5_VB5_FMA_Ro<bits<6> opcode, bits<10> xo, string opc,
list<dag> pattern>
: X_VT5_VA5_VB5_FMA<opcode, xo, opc, pattern>, isRecordForm;
//===--------------------------------------------------------------------===//
// Quad-Precision Scalar Move Instructions:
// Copy Sign
def XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp",
[(set f128:$vT,
(fcopysign f128:$vB, f128:$vA))]>;
// Absolute/Negative-Absolute/Negate
def XSABSQP : X_VT5_XO5_VB5<63, 0, 804, "xsabsqp",
[(set f128:$vT, (fabs f128:$vB))]>;
def XSNABSQP : X_VT5_XO5_VB5<63, 8, 804, "xsnabsqp",
[(set f128:$vT, (fneg (fabs f128:$vB)))]>;
def XSNEGQP : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp",
[(set f128:$vT, (fneg f128:$vB))]>;
//===--------------------------------------------------------------------===//
// Quad-Precision Scalar Floating-Point Arithmetic Instructions:
// Add/Divide/Multiply/Subtract
let isCommutable = 1 in {
def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp",
[(set f128:$vT, (fadd f128:$vA, f128:$vB))]>;
def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp",
[(set f128:$vT, (fmul f128:$vA, f128:$vB))]>;
}
def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" ,
[(set f128:$vT, (fsub f128:$vA, f128:$vB))]>;
def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp",
[(set f128:$vT, (fdiv f128:$vA, f128:$vB))]>;
// Square-Root
def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp",
[(set f128:$vT, (fsqrt f128:$vB))]>;
// (Negative) Multiply-{Add/Subtract}
def XSMADDQP : X_VT5_VA5_VB5_FMA <63, 388, "xsmaddqp",
[(set f128:$vT,
(fma f128:$vA, f128:$vB,
f128:$vTi))]>;
def XSMSUBQP : X_VT5_VA5_VB5_FMA <63, 420, "xsmsubqp" ,
[(set f128:$vT,
(fma f128:$vA, f128:$vB,
(fneg f128:$vTi)))]>;
def XSNMADDQP : X_VT5_VA5_VB5_FMA <63, 452, "xsnmaddqp",
[(set f128:$vT,
(fneg (fma f128:$vA, f128:$vB,
f128:$vTi)))]>;
def XSNMSUBQP : X_VT5_VA5_VB5_FMA <63, 484, "xsnmsubqp",
[(set f128:$vT,
(fneg (fma f128:$vA, f128:$vB,
(fneg f128:$vTi))))]>;
let isCommutable = 1 in {
def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo",
[(set f128:$vT,
(int_ppc_addf128_round_to_odd
f128:$vA, f128:$vB))]>;
def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo",
[(set f128:$vT,
(int_ppc_mulf128_round_to_odd
f128:$vA, f128:$vB))]>;
}
def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo",
[(set f128:$vT,
(int_ppc_subf128_round_to_odd
f128:$vA, f128:$vB))]>;
def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo",
[(set f128:$vT,
(int_ppc_divf128_round_to_odd
f128:$vA, f128:$vB))]>;
def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo",
[(set f128:$vT,
(int_ppc_sqrtf128_round_to_odd f128:$vB))]>;
def XSMADDQPO : X_VT5_VA5_VB5_FMA_Ro<63, 388, "xsmaddqpo",
[(set f128:$vT,
(int_ppc_fmaf128_round_to_odd
f128:$vA,f128:$vB,f128:$vTi))]>;
def XSMSUBQPO : X_VT5_VA5_VB5_FMA_Ro<63, 420, "xsmsubqpo" ,
[(set f128:$vT,
(int_ppc_fmaf128_round_to_odd
f128:$vA, f128:$vB, (fneg f128:$vTi)))]>;
def XSNMADDQPO: X_VT5_VA5_VB5_FMA_Ro<63, 452, "xsnmaddqpo",
[(set f128:$vT,
(fneg (int_ppc_fmaf128_round_to_odd
f128:$vA, f128:$vB, f128:$vTi)))]>;
def XSNMSUBQPO: X_VT5_VA5_VB5_FMA_Ro<63, 484, "xsnmsubqpo",
[(set f128:$vT,
(fneg (int_ppc_fmaf128_round_to_odd
f128:$vA, f128:$vB, (fneg f128:$vTi))))]>;
// Additional fnmsub patterns: -a*b + c == -(a*b - c)
def : Pat<(fma (fneg f128:$A), f128:$B, f128:$C), (XSNMSUBQP $C, $A, $B)>;
def : Pat<(fma f128:$A, (fneg f128:$B), f128:$C), (XSNMSUBQP $C, $A, $B)>;
//===--------------------------------------------------------------------===//
// Quad/Double-Precision Compare Instructions:
// [PO BF // VRA VRB XO /]
class X_BF3_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
list<dag> pattern>
: XForm_17<opcode, xo, (outs crrc:$crD), (ins vrrc:$VA, vrrc:$VB),
!strconcat(opc, " $crD, $VA, $VB"), IIC_FPCompare> {
let Pattern = pattern;
}
// QP Compare Ordered/Unordered
def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>;
def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>;
// DP/QP Compare Exponents
def XSCMPEXPDP : XX3Form_1<60, 59,
(outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
"xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>;
def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>;
// DP Compare ==, >=, >, !=
// Use vsrc for XT, because the entire register of XT is set.
// XT.dword[1] = 0x0000_0000_0000_0000
def XSCMPEQDP : XX3_XT5_XA5_XB5<60, 3, "xscmpeqdp", vsrc, vsfrc, vsfrc,
IIC_FPCompare, []>;
def XSCMPGEDP : XX3_XT5_XA5_XB5<60, 19, "xscmpgedp", vsrc, vsfrc, vsfrc,
IIC_FPCompare, []>;
def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc,
IIC_FPCompare, []>;
//===--------------------------------------------------------------------===//
// Quad-Precision Floating-Point Conversion Instructions:
// Convert DP -> QP
def XSCVDPQP : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc,
[(set f128:$vT, (fpextend f64:$vB))]>;
// Round & Convert QP -> DP (dword[1] is set to zero)
def XSCVQPDP : X_VT5_XO5_VB5_VSFR<63, 20, 836, "xscvqpdp" , []>;
def XSCVQPDPO : X_VT5_XO5_VB5_VSFR_Ro<63, 20, 836, "xscvqpdpo",
[(set f64:$vT,
(int_ppc_truncf128_round_to_odd
f128:$vB))]>;
// Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero)
def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>;
def XSCVQPSWZ : X_VT5_XO5_VB5<63, 9, 836, "xscvqpswz", []>;
def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>;
def XSCVQPUWZ : X_VT5_XO5_VB5<63, 1, 836, "xscvqpuwz", []>;
// Convert (Un)Signed DWord -> QP.
def XSCVSDQP : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vfrc, []>;
def : Pat<(f128 (sint_to_fp i64:$src)),
(f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
def : Pat<(f128 (sint_to_fp (i64 (PPCmfvsr f64:$src)))),
(f128 (XSCVSDQP $src))>;
def : Pat<(f128 (sint_to_fp (i32 (PPCmfvsr f64:$src)))),
(f128 (XSCVSDQP (VEXTSW2Ds $src)))>;
def XSCVUDQP : X_VT5_XO5_VB5_TyVB<63, 2, 836, "xscvudqp", vfrc, []>;
def : Pat<(f128 (uint_to_fp i64:$src)),
(f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
def : Pat<(f128 (uint_to_fp (i64 (PPCmfvsr f64:$src)))),
(f128 (XSCVUDQP $src))>;
// Convert (Un)Signed Word -> QP.
def : Pat<(f128 (sint_to_fp i32:$src)),
(f128 (XSCVSDQP (MTVSRWA $src)))>;
def : Pat<(f128 (sint_to_fp (i32 (load xoaddr:$src)))),
(f128 (XSCVSDQP (LIWAX xoaddr:$src)))>;
def : Pat<(f128 (uint_to_fp i32:$src)),
(f128 (XSCVUDQP (MTVSRWZ $src)))>;
def : Pat<(f128 (uint_to_fp (i32 (load xoaddr:$src)))),
(f128 (XSCVUDQP (LIWZX xoaddr:$src)))>;
//===--------------------------------------------------------------------===//
// Round to Floating-Point Integer Instructions
// (Round &) Convert DP <-> HP
// Note! xscvdphp's src and dest registers both use only the left 64 bits, so
// we use vsfrc for both. xscvhpdp's src uses only the left 16 bits, but we
// still use vsfrc for it.
def XSCVDPHP : XX2_XT6_XO5_XB6<60, 17, 347, "xscvdphp", vsfrc, []>;
def XSCVHPDP : XX2_XT6_XO5_XB6<60, 16, 347, "xscvhpdp", vsfrc, []>;
// Vector HP -> SP
def XVCVHPSP : XX2_XT6_XO5_XB6<60, 24, 475, "xvcvhpsp", vsrc, []>;
def XVCVSPHP : XX2_XT6_XO5_XB6<60, 25, 475, "xvcvsphp", vsrc,
[(set v4f32:$XT,
(int_ppc_vsx_xvcvsphp v4f32:$XB))]>;
// Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
// separate pattern so that it can convert the input register class from
// VRRC(v8i16) to VSRC.
def : Pat<(v4f32 (int_ppc_vsx_xvcvhpsp v8i16:$A)),
(v4f32 (XVCVHPSP (COPY_TO_REGCLASS $A, VSRC)))>;
class Z23_VT5_R1_VB5_RMC2_EX1<bits<6> opcode, bits<8> xo, bit ex, string opc,
list<dag> pattern>
: Z23Form_8<opcode, xo,
(outs vrrc:$vT), (ins u1imm:$r, vrrc:$vB, u2imm:$rmc),
!strconcat(opc, " $r, $vT, $vB, $rmc"), IIC_VecFP, pattern> {
let RC = ex;
}
// Round to Quad-Precision Integer [with Inexact]
def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>;
def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>;
// Use current rounding mode
def : Pat<(f128 (fnearbyint f128:$vB)), (f128 (XSRQPI 0, $vB, 3))>;
// Round to nearest, ties away from zero
def : Pat<(f128 (fround f128:$vB)), (f128 (XSRQPI 0, $vB, 0))>;
// Round towards Zero
def : Pat<(f128 (ftrunc f128:$vB)), (f128 (XSRQPI 1, $vB, 1))>;
// Round towards +Inf
def : Pat<(f128 (fceil f128:$vB)), (f128 (XSRQPI 1, $vB, 2))>;
// Round towards -Inf
def : Pat<(f128 (ffloor f128:$vB)), (f128 (XSRQPI 1, $vB, 3))>;
// Use current rounding mode, [with Inexact]
def : Pat<(f128 (frint f128:$vB)), (f128 (XSRQPIX 0, $vB, 3))>;
// Round Quad-Precision to Double-Extended Precision (fp80)
def XSRQPXP : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>;
//===--------------------------------------------------------------------===//
// Insert/Extract Instructions
// Insert Exponent DP/QP
// XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU
def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
"xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>;
// vB NOTE: only vB.dword[0] is used, which is why we don't use the
// X_VT5_VA5_VB5 form
def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB),
"xsiexpqp $vT, $vA, $vB", IIC_VecFP, []>;
def : Pat<(f128 (int_ppc_scalar_insert_exp_qp f128:$vA, i64:$vB)),
(f128 (XSIEXPQP $vA, (MTVSRD $vB)))>;
// Extract Exponent/Significand DP/QP
def XSXEXPDP : XX2_RT5_XO5_XB6<60, 0, 347, "xsxexpdp", []>;
def XSXSIGDP : XX2_RT5_XO5_XB6<60, 1, 347, "xsxsigdp", []>;
def XSXEXPQP : X_VT5_XO5_VB5 <63, 2, 804, "xsxexpqp", []>;
def XSXSIGQP : X_VT5_XO5_VB5 <63, 18, 804, "xsxsigqp", []>;
def : Pat<(i64 (int_ppc_scalar_extract_expq f128:$vA)),
(i64 (MFVSRD (EXTRACT_SUBREG
(v2i64 (XSXEXPQP $vA)), sub_64)))>;
// Vector Insert Word
// XB NOTE: Only XB.dword[1] is used, but we use vsrc on XB.
def XXINSERTW :
XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT),
(ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM),
"xxinsertw $XT, $XB, $UIM", IIC_VecFP,
[(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB,
imm32SExt16:$UIM))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
// Vector Extract Unsigned Word
def XXEXTRACTUW : XX2_RD6_UIM5_RS6<60, 165,
(outs vsfrc:$XT), (ins vsrc:$XB, u4imm:$UIMM),
"xxextractuw $XT, $XB, $UIMM", IIC_VecFP, []>;
// Vector Insert Exponent DP/SP
def XVIEXPDP : XX3_XT5_XA5_XB5<60, 248, "xviexpdp", vsrc, vsrc, vsrc,
IIC_VecFP, [(set v2f64: $XT,(int_ppc_vsx_xviexpdp v2i64:$XA, v2i64:$XB))]>;
def XVIEXPSP : XX3_XT5_XA5_XB5<60, 216, "xviexpsp", vsrc, vsrc, vsrc,
IIC_VecFP, [(set v4f32: $XT,(int_ppc_vsx_xviexpsp v4i32:$XA, v4i32:$XB))]>;
// Vector Extract Exponent/Significand DP/SP
def XVXEXPDP : XX2_XT6_XO5_XB6<60, 0, 475, "xvxexpdp", vsrc,
[(set v2i64: $XT,
(int_ppc_vsx_xvxexpdp v2f64:$XB))]>;
def XVXEXPSP : XX2_XT6_XO5_XB6<60, 8, 475, "xvxexpsp", vsrc,
[(set v4i32: $XT,
(int_ppc_vsx_xvxexpsp v4f32:$XB))]>;
def XVXSIGDP : XX2_XT6_XO5_XB6<60, 1, 475, "xvxsigdp", vsrc,
[(set v2i64: $XT,
(int_ppc_vsx_xvxsigdp v2f64:$XB))]>;
def XVXSIGSP : XX2_XT6_XO5_XB6<60, 9, 475, "xvxsigsp", vsrc,
[(set v4i32: $XT,
(int_ppc_vsx_xvxsigsp v4f32:$XB))]>;
let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Extra patterns expanding to vector Extract Word/Insert Word
def : Pat<(v4i32 (int_ppc_vsx_xxinsertw v4i32:$A, v2i64:$B, imm:$IMM)),
(v4i32 (XXINSERTW $A, $B, imm:$IMM))>;
def : Pat<(v2i64 (int_ppc_vsx_xxextractuw v2i64:$A, imm:$IMM)),
(v2i64 (COPY_TO_REGCLASS (XXEXTRACTUW $A, imm:$IMM), VSRC))>;
} // AddedComplexity = 400, HasP9Vector
//===--------------------------------------------------------------------===//
// Test Data Class SP/DP/QP
def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298,
(outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
"xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>;
def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362,
(outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
"xststdcdp $BF, $XB, $DCMX", IIC_VecFP, []>;
def XSTSTDCQP : X_BF3_DCMX7_RS5 <63, 708,
(outs crrc:$BF), (ins u7imm:$DCMX, vrrc:$vB),
"xststdcqp $BF, $vB, $DCMX", IIC_VecFP, []>;
// Vector Test Data Class SP/DP
def XVTSTDCSP : XX2_RD6_DCMX7_RS6<60, 13, 5,
(outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
"xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP,
[(set v4i32: $XT,
(int_ppc_vsx_xvtstdcsp v4f32:$XB, timm:$DCMX))]>;
def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5,
(outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
"xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP,
[(set v2i64: $XT,
(int_ppc_vsx_xvtstdcdp v2f64:$XB, timm:$DCMX))]>;
//===--------------------------------------------------------------------===//
// Maximum/Minimum Type-C/Type-J DP
def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsfrc, vsfrc, vsfrc,
IIC_VecFP,
[(set f64:$XT, (PPCxsmaxc f64:$XA, f64:$XB))]>;
def XSMAXJDP : XX3_XT5_XA5_XB5<60, 144, "xsmaxjdp", vsrc, vsfrc, vsfrc,
IIC_VecFP, []>;
def XSMINCDP : XX3_XT5_XA5_XB5<60, 136, "xsmincdp", vsfrc, vsfrc, vsfrc,
IIC_VecFP,
[(set f64:$XT, (PPCxsminc f64:$XA, f64:$XB))]>;
def XSMINJDP : XX3_XT5_XA5_XB5<60, 152, "xsminjdp", vsrc, vsfrc, vsfrc,
IIC_VecFP, []>;
//===--------------------------------------------------------------------===//
// Vector Byte-Reverse H/W/D/Q Word
def XXBRH : XX2_XT6_XO5_XB6<60, 7, 475, "xxbrh", vsrc, []>;
def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc,
[(set v4i32:$XT, (bswap v4i32:$XB))]>;
def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc,
[(set v2i64:$XT, (bswap v2i64:$XB))]>;
def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>;
// Vector Reverse
def : Pat<(v8i16 (bswap v8i16 :$A)),
(v8i16 (COPY_TO_REGCLASS (XXBRH (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
def : Pat<(v1i128 (bswap v1i128 :$A)),
(v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
// Vector Permute
def XXPERM : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc,
IIC_VecPerm, []>;
def XXPERMR : XX3_XT5_XA5_XB5<60, 58, "xxpermr", vsrc, vsrc, vsrc,
IIC_VecPerm, []>;
// Vector Splat Immediate Byte
def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
"xxspltib $XT, $IMM8", IIC_VecPerm, []>;
//===--------------------------------------------------------------------===//
// Vector/Scalar Load/Store Instructions
// When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
// PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
// Load Vector
def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src),
"lxv $XT, $src", IIC_LdStLFD, []>;
// Load DWord
def LXSD : DSForm_1<57, 2, (outs vfrc:$vD), (ins memrix:$src),
"lxsd $vD, $src", IIC_LdStLFD, []>;
// Load SP from src, convert it to DP, and place in dword[0]
def LXSSP : DSForm_1<57, 3, (outs vfrc:$vD), (ins memrix:$src),
"lxssp $vD, $src", IIC_LdStLFD, []>;
// [PO T RA RB XO TX] almost equal to [PO S RA RB XO SX], but has different
// "out" and "in" dags
class X_XT6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
RegisterOperand vtype, list<dag> pattern>
: XX1Form_memOp<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
!strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>;
// Load as Integer Byte/Halfword & Zero Indexed
def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc,
[(set f64:$XT, (PPClxsizx xoaddr:$src, 1))]>;
def LXSIHZX : X_XT6_RA5_RB5<31, 813, "lxsihzx", vsfrc,
[(set f64:$XT, (PPClxsizx xoaddr:$src, 2))]>;
// Load Vector Halfword*8/Byte*16 Indexed
def LXVH8X : X_XT6_RA5_RB5<31, 812, "lxvh8x" , vsrc, []>;
def LXVB16X : X_XT6_RA5_RB5<31, 876, "lxvb16x", vsrc, []>;
// Load Vector Indexed
def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
[(set v2f64:$XT, (load xaddrX16:$src))]>;
// Load Vector (Left-justified) with Length
def LXVL : XX1Form_memOp<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvl $XT, $src, $rB", IIC_LdStLoad,
[(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>;
def LXVLL : XX1Form_memOp<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvll $XT, $src, $rB", IIC_LdStLoad,
[(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>;
// Load Vector Word & Splat Indexed
def LXVWSX : X_XT6_RA5_RB5<31, 364, "lxvwsx" , vsrc, []>;
} // mayLoad
// When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
// PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
// Store Vector
def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst),
"stxv $XT, $dst", IIC_LdStSTFD, []>;
// Store DWord
def STXSD : DSForm_1<61, 2, (outs), (ins vfrc:$vS, memrix:$dst),
"stxsd $vS, $dst", IIC_LdStSTFD, []>;
// Convert DP of dword[0] to SP, and Store to dst
def STXSSP : DSForm_1<61, 3, (outs), (ins vfrc:$vS, memrix:$dst),
"stxssp $vS, $dst", IIC_LdStSTFD, []>;
// [PO S RA RB XO SX]
class X_XS6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
RegisterOperand vtype, list<dag> pattern>
: XX1Form_memOp<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
!strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>;
// Store as Integer Byte/Halfword Indexed
def STXSIBX : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsfrc,
[(PPCstxsix f64:$XT, xoaddr:$dst, 1)]>;
def STXSIHX : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsfrc,
[(PPCstxsix f64:$XT, xoaddr:$dst, 2)]>;
let isCodeGenOnly = 1 in {
def STXSIBXv : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsrc, []>;
def STXSIHXv : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsrc, []>;
}
// Store Vector Halfword*8/Byte*16 Indexed
def STXVH8X : X_XS6_RA5_RB5<31, 940, "stxvh8x" , vsrc, []>;
def STXVB16X : X_XS6_RA5_RB5<31, 1004, "stxvb16x", vsrc, []>;
// Store Vector Indexed
def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc,
[(store v2f64:$XT, xaddrX16:$dst)]>;
// Store Vector (Left-justified) with Length
def STXVL : XX1Form_memOp<31, 397, (outs),
(ins vsrc:$XT, memr:$dst, g8rc:$rB),
"stxvl $XT, $dst, $rB", IIC_LdStLoad,
[(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst,
i64:$rB)]>;
def STXVLL : XX1Form_memOp<31, 429, (outs),
(ins vsrc:$XT, memr:$dst, g8rc:$rB),
"stxvll $XT, $dst, $rB", IIC_LdStLoad,
[(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst,
i64:$rB)]>;
} // mayStore
let Predicates = [IsLittleEndian] in {
def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
}
let Predicates = [IsBigEndian] in {
def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
}
// Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead
// of f64
def : Pat<(v8i16 (PPCmtvsrz i32:$A)),
(v8i16 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
def : Pat<(v16i8 (PPCmtvsrz i32:$A)),
(v16i8 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
// Patterns for which instructions from ISA 3.0 are a better match
let Predicates = [IsLittleEndian, HasP9Vector] in {
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
def : Pat<(v8i16 (PPCld_vec_be xoaddr:$src)),
(COPY_TO_REGCLASS (LXVH8X xoaddr:$src), VRRC)>;
def : Pat<(PPCst_vec_be v8i16:$rS, xoaddr:$dst),
(STXVH8X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
def : Pat<(v16i8 (PPCld_vec_be xoaddr:$src)),
(COPY_TO_REGCLASS (LXVB16X xoaddr:$src), VRRC)>;
def : Pat<(PPCst_vec_be v16i8:$rS, xoaddr:$dst),
(STXVB16X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
} // IsLittleEndian, HasP9Vector
let Predicates = [IsBigEndian, HasP9Vector] in {
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
} // IsBigEndian, HasP9Vector
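// For reference, a minimal C sketch (GNU vector extensions; the function name
// is illustrative) of source code that should exercise the insertelt patterns
// above:
//   typedef float v4f32 __attribute__((vector_size(16)));
//   v4f32 set_lane2(v4f32 v, float f) {
//     v[2] = f;        // (insertelt v4f32:$A, f32:$B, 2) -> XXINSERTW
//     return v;
//   }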
// D-Form Load/Store
def : Pat<(v4i32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
def : Pat<(v4f32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
def : Pat<(v2i64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
def : Pat<(v2f64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
def : Pat<(f128 (quadwOffsetLoad iaddrX16:$src)),
(COPY_TO_REGCLASS (LXV memrix16:$src), VRRC)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddrX16:$src)), (LXV memrix16:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddrX16:$src)), (LXV memrix16:$src)>;
def : Pat<(quadwOffsetStore v4f32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(quadwOffsetStore v4i32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(quadwOffsetStore v2f64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(quadwOffsetStore f128:$rS, iaddrX16:$dst),
(STXV (COPY_TO_REGCLASS $rS, VSRC), memrix16:$dst)>;
def : Pat<(quadwOffsetStore v2i64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddrX16:$dst),
(STXV $rS, memrix16:$dst)>;
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddrX16:$dst),
(STXV $rS, memrix16:$dst)>;
def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v2i64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(f128 (nonQuadwOffsetLoad xoaddr:$src)),
(COPY_TO_REGCLASS (LXVX xoaddr:$src), VRRC)>;
def : Pat<(nonQuadwOffsetStore f128:$rS, xoaddr:$dst),
(STXVX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
let AddedComplexity = 400 in {
// LIWAX - This instruction is used for sign extending i32 -> i64.
// LIWZX - This instruction will be emitted for i32, f32, and when
// zero-extending i32 to i64 (zext i32 -> i64).
let Predicates = [IsLittleEndian] in {
def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
(v2i64 (XXPERMDIs
(COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC), 2))>;
def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
(v2i64 (XXPERMDIs
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
(v4i32 (XXPERMDIs
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
(v4f32 (XXPERMDIs
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;
}
let Predicates = [IsBigEndian] in {
def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
(v2i64 (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC))>;
def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
(v2i64 (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC))>;
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
(v4i32 (XXSLDWIs
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
(v4f32 (XXSLDWIs
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
}
}
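// For reference, a C sketch (GNU vector extensions; illustrative names) of
// extending scalar loads that should exercise the LIWAX/LIWZX patterns above:
//   #include <stdint.h>
//   typedef long long v2i64 __attribute__((vector_size(16)));
//   v2i64 from_sext(const int32_t *p)  { return (v2i64){*p, 0}; } // LIWAX
//   v2i64 from_zext(const uint32_t *p) { return (v2i64){*p, 0}; } // LIWZX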
// Build vectors from i8 loads
def : Pat<(v16i8 (scalar_to_vector ScalarLoads.Li8)),
(v16i8 (VSPLTBs 7, (LXSIBZX xoaddr:$src)))>;
def : Pat<(v8i16 (scalar_to_vector ScalarLoads.ZELi8)),
(v8i16 (VSPLTHs 3, (LXSIBZX xoaddr:$src)))>;
def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi8)),
(v4i32 (XXSPLTWs (LXSIBZX xoaddr:$src), 1))>;
def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi8i64)),
(v2i64 (XXPERMDIs (LXSIBZX xoaddr:$src), 0))>;
def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi8)),
(v4i32 (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1))>;
def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi8i64)),
(v2i64 (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0))>;
// Build vectors from i16 loads
def : Pat<(v8i16 (scalar_to_vector ScalarLoads.Li16)),
(v8i16 (VSPLTHs 3, (LXSIHZX xoaddr:$src)))>;
def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi16)),
(v4i32 (XXSPLTWs (LXSIHZX xoaddr:$src), 1))>;
def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi16i64)),
(v2i64 (XXPERMDIs (LXSIHZX xoaddr:$src), 0))>;
def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi16)),
(v4i32 (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1))>;
def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi16i64)),
(v2i64 (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0))>;
+ // Load/convert and convert/store patterns for f16.
+ def : Pat<(f64 (extloadf16 xoaddr:$src)),
+ (f64 (XSCVHPDP (LXSIHZX xoaddr:$src)))>;
+ def : Pat<(truncstoref16 f64:$src, xoaddr:$dst),
+ (STXSIHX (XSCVDPHP $src), xoaddr:$dst)>;
+ def : Pat<(f32 (extloadf16 xoaddr:$src)),
+ (f32 (COPY_TO_REGCLASS (XSCVHPDP (LXSIHZX xoaddr:$src)), VSSRC))>;
+ def : Pat<(truncstoref16 f32:$src, xoaddr:$dst),
+ (STXSIHX (XSCVDPHP (COPY_TO_REGCLASS $src, VSFRC)), xoaddr:$dst)>;
+ def : Pat<(f64 (f16_to_fp i32:$A)),
+ (f64 (XSCVHPDP (MTVSRWZ $A)))>;
+ def : Pat<(f32 (f16_to_fp i32:$A)),
+ (f32 (COPY_TO_REGCLASS (XSCVHPDP (MTVSRWZ $A)), VSSRC))>;
+ def : Pat<(i32 (fp_to_f16 f32:$A)),
+ (i32 (MFVSRWZ (XSCVDPHP (COPY_TO_REGCLASS $A, VSFRC))))>;
+ def : Pat<(i32 (fp_to_f16 f64:$A)), (i32 (MFVSRWZ (XSCVDPHP $A)))>;
+
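// At the source level, the new f16 patterns above correspond to half-precision
// loads, stores, and conversions; a minimal sketch using the Clang/GCC __fp16
// storage-only type (availability is target-dependent):
//   float load_half(const __fp16 *p)    { return *p; }      // extloadf16
//   void store_half(__fp16 *p, float f) { *p = (__fp16)f; } // truncstoref16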
let Predicates = [IsBigEndian, HasP9Vector] in {
// Scalar stores of i8
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
// Scalar stores of i16
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
} // IsBigEndian, HasP9Vector
let Predicates = [IsLittleEndian, HasP9Vector] in {
// Scalar stores of i8
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
(STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>;
// Scalar stores of i16
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
(STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
} // IsLittleEndian, HasP9Vector
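// A C sketch (GNU vector extensions; illustrative names) of the truncating
// element stores handled above:
//   #include <stdint.h>
//   typedef unsigned char v16u8 __attribute__((vector_size(16)));
//   void store_byte5(uint8_t *p, v16u8 v) { *p = v[5]; } // VSLDOI + STXSIBXv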
// Vector sign extensions
def : Pat<(f64 (PPCVexts f64:$A, 1)),
(f64 (COPY_TO_REGCLASS (VEXTSB2Ds $A), VSFRC))>;
def : Pat<(f64 (PPCVexts f64:$A, 2)),
(f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>;
def DFLOADf32 : PPCPostRAExpPseudo<(outs vssrc:$XT), (ins memrix:$src),
"#DFLOADf32",
[(set f32:$XT, (load iaddrX4:$src))]>;
def DFLOADf64 : PPCPostRAExpPseudo<(outs vsfrc:$XT), (ins memrix:$src),
"#DFLOADf64",
[(set f64:$XT, (load iaddrX4:$src))]>;
def DFSTOREf32 : PPCPostRAExpPseudo<(outs), (ins vssrc:$XT, memrix:$dst),
"#DFSTOREf32",
[(store f32:$XT, iaddrX4:$dst)]>;
def DFSTOREf64 : PPCPostRAExpPseudo<(outs), (ins vsfrc:$XT, memrix:$dst),
"#DFSTOREf64",
[(store f64:$XT, iaddrX4:$dst)]>;
def : Pat<(f64 (extloadf32 iaddrX4:$src)),
(COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$src), VSFRC)>;
def : Pat<(f32 (fpround (f64 (extloadf32 iaddrX4:$src)))),
(f32 (DFLOADf32 iaddrX4:$src))>;
def : Pat<(v4f32 (PPCldvsxlh xaddr:$src)),
(COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC)>;
def : Pat<(v4f32 (PPCldvsxlh iaddrX4:$src)),
(COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC)>;
let AddedComplexity = 400 in {
// The following pseudoinstructions are used to ensure the utilization
// of all 64 VSX registers.
let Predicates = [IsLittleEndian, HasP9Vector] in {
def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))),
(v2i64 (XXPERMDIs
(COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC), 2))>;
def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))),
(v2i64 (XXPERMDIs
(COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC), 2))>;
def : Pat<(v2f64 (scalar_to_vector (f64 (load iaddrX4:$src)))),
(v2f64 (XXPERMDIs
(COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC), 2))>;
def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddrX4:$src)))),
(v2f64 (XXPERMDIs
(COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC), 2))>;
def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
sub_64), xaddrX4:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
sub_64), xaddrX4:$src)>;
def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src),
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src),
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src),
(DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
sub_64), iaddrX4:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src),
(DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
iaddrX4:$src)>;
def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src),
(DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src),
(DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
} // IsLittleEndian, HasP9Vector
let Predicates = [IsBigEndian, HasP9Vector] in {
def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))),
(v2i64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>;
def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))),
(v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>;
def : Pat<(v2f64 (scalar_to_vector (f64 (load iaddrX4:$src)))),
(v2f64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>;
def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddrX4:$src)))),
(v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>;
def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
sub_64), xaddrX4:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
sub_64), xaddrX4:$src)>;
def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src),
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src),
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src),
(DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
sub_64), iaddrX4:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src),
(DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
sub_64), iaddrX4:$src)>;
def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src),
(DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src),
(DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
} // IsBigEndian, HasP9Vector
}
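// A C sketch (GNU vector extensions; illustrative name) of the element
// extract-and-store cases covered by the pseudoinstructions above:
//   typedef double v2f64 __attribute__((vector_size(16)));
//   void store_lo(double *p, v2f64 v) { *p = v[0]; } // XFSTOREf64/DFSTOREf64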
let Predicates = [IsBigEndian, HasP9Vector] in {
// (Un)Signed DWord vector extract -> QP
def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
(f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))),
(f128 (XSCVSDQP
(EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))),
(f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))),
(f128 (XSCVUDQP
(EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
// (Un)Signed Word vector extract -> QP
def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 1)))),
(f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
foreach Idx = [0,2,3] in {
def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, Idx)))),
(f128 (XSCVSDQP (EXTRACT_SUBREG
(VEXTSW2D (VSPLTW Idx, $src)), sub_64)))>;
}
foreach Idx = 0-3 in {
def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, Idx)))),
(f128 (XSCVUDQP (XXEXTRACTUW $src, !shl(Idx, 2))))>;
}
// (Un)Signed HWord vector extract -> QP
foreach Idx = 0-7 in {
def : Pat<(f128 (sint_to_fp
(i32 (sext_inreg
(vector_extract v8i16:$src, Idx), i16)))),
(f128 (XSCVSDQP (EXTRACT_SUBREG
(VEXTSH2D (VEXTRACTUH !add(Idx, Idx), $src)),
sub_64)))>;
// The SDAG adds the `and` since an `i16` is being extracted as an `i32`.
def : Pat<(f128 (uint_to_fp
(and (i32 (vector_extract v8i16:$src, Idx)), 65535))),
(f128 (XSCVUDQP (EXTRACT_SUBREG
(VEXTRACTUH !add(Idx, Idx), $src), sub_64)))>;
}
// (Un)Signed Byte vector extract -> QP
foreach Idx = 0-15 in {
def : Pat<(f128 (sint_to_fp
(i32 (sext_inreg (vector_extract v16i8:$src, Idx),
i8)))),
(f128 (XSCVSDQP (EXTRACT_SUBREG
(VEXTSB2D (VEXTRACTUB Idx, $src)), sub_64)))>;
def : Pat<(f128 (uint_to_fp
(and (i32 (vector_extract v16i8:$src, Idx)), 255))),
(f128 (XSCVUDQP
(EXTRACT_SUBREG (VEXTRACTUB Idx, $src), sub_64)))>;
}
// Unsigned int in VSX register -> QP
def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
(f128 (XSCVUDQP
(XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 4)))>;
} // IsBigEndian, HasP9Vector
let Predicates = [IsLittleEndian, HasP9Vector] in {
// (Un)Signed DWord vector extract -> QP
def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
(f128 (XSCVSDQP
(EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))),
(f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))),
(f128 (XSCVUDQP
(EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))),
(f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
// (Un)Signed Word vector extract -> QP
foreach Idx = [[0,3],[1,2],[3,0]] in {
def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))),
(f128 (XSCVSDQP (EXTRACT_SUBREG
(VEXTSW2D (VSPLTW !head(!tail(Idx)), $src)),
sub_64)))>;
}
def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 2)))),
(f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
foreach Idx = [[0,12],[1,8],[2,4],[3,0]] in {
def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))),
(f128 (XSCVUDQP (XXEXTRACTUW $src, !head(!tail(Idx)))))>;
}
// (Un)Signed HWord vector extract -> QP
// The nested foreach lists identify the vector element and the corresponding
// register byte location.
foreach Idx = [[0,14],[1,12],[2,10],[3,8],[4,6],[5,4],[6,2],[7,0]] in {
def : Pat<(f128 (sint_to_fp
(i32 (sext_inreg
(vector_extract v8i16:$src, !head(Idx)), i16)))),
(f128 (XSCVSDQP
(EXTRACT_SUBREG (VEXTSH2D
(VEXTRACTUH !head(!tail(Idx)), $src)),
sub_64)))>;
def : Pat<(f128 (uint_to_fp
(and (i32 (vector_extract v8i16:$src, !head(Idx))),
65535))),
(f128 (XSCVUDQP (EXTRACT_SUBREG
(VEXTRACTUH !head(!tail(Idx)), $src), sub_64)))>;
}
// (Un)Signed Byte vector extract -> QP
foreach Idx = [[0,15],[1,14],[2,13],[3,12],[4,11],[5,10],[6,9],[7,8],[8,7],
[9,6],[10,5],[11,4],[12,3],[13,2],[14,1],[15,0]] in {
def : Pat<(f128 (sint_to_fp
(i32 (sext_inreg
(vector_extract v16i8:$src, !head(Idx)), i8)))),
(f128 (XSCVSDQP
(EXTRACT_SUBREG
(VEXTSB2D (VEXTRACTUB !head(!tail(Idx)), $src)),
sub_64)))>;
def : Pat<(f128 (uint_to_fp
(and (i32 (vector_extract v16i8:$src, !head(Idx))),
255))),
(f128 (XSCVUDQP
(EXTRACT_SUBREG
(VEXTRACTUB !head(!tail(Idx)), $src), sub_64)))>;
}
// Unsigned int in VSX register -> QP
def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
(f128 (XSCVUDQP
(XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 8)))>;
} // IsLittleEndian, HasP9Vector
// Convert (Un)Signed DWord in memory -> QP
def : Pat<(f128 (sint_to_fp (i64 (load xaddrX4:$src)))),
(f128 (XSCVSDQP (LXSDX xaddrX4:$src)))>;
def : Pat<(f128 (sint_to_fp (i64 (load iaddrX4:$src)))),
(f128 (XSCVSDQP (LXSD iaddrX4:$src)))>;
def : Pat<(f128 (uint_to_fp (i64 (load xaddrX4:$src)))),
(f128 (XSCVUDQP (LXSDX xaddrX4:$src)))>;
def : Pat<(f128 (uint_to_fp (i64 (load iaddrX4:$src)))),
(f128 (XSCVUDQP (LXSD iaddrX4:$src)))>;
// Convert Unsigned HWord in memory -> QP
def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi16)),
(f128 (XSCVUDQP (LXSIHZX xaddr:$src)))>;
// Convert Unsigned Byte in memory -> QP
def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi8)),
(f128 (XSCVUDQP (LXSIBZX xoaddr:$src)))>;
// Truncate & Convert QP -> (Un)Signed (D)Word.
def : Pat<(i64 (fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>;
def : Pat<(i64 (fp_to_uint f128:$src)), (i64 (MFVRD (XSCVQPUDZ $src)))>;
def : Pat<(i32 (fp_to_sint f128:$src)),
(i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC)))>;
def : Pat<(i32 (fp_to_uint f128:$src)),
(i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC)))>;
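// A C sketch (assumes IEEE-quad __float128 support, e.g. -mfloat128; names
// are illustrative) of the quad-precision conversions above:
//   __float128 ll2q(long long x)  { return x; }            // XSCVSDQP
//   long long  q2ll(__float128 q) { return (long long)q; } // XSCVQPSDZ + MFVRD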
// Instructions for store(fptosi).
// The 8-byte version is repeated here due to availability of D-Form STXSD.
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddrX4:$dst, 8),
(STXSDX (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
xaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), iaddrX4:$dst, 8),
(STXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
iaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 4),
(STXSIWX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 2),
(STXSIHX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 1),
(STXSIBX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddrX4:$dst, 8),
(STXSDX (XSCVDPSXDS f64:$src), xaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), iaddrX4:$dst, 8),
(STXSD (XSCVDPSXDS f64:$src), iaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 2),
(STXSIHX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 1),
(STXSIBX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
// Instructions for store(fptoui).
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddrX4:$dst, 8),
(STXSDX (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
xaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), iaddrX4:$dst, 8),
(STXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
iaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 4),
(STXSIWX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 2),
(STXSIHX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 1),
(STXSIBX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddrX4:$dst, 8),
(STXSDX (XSCVDPUXDS f64:$src), xaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), iaddrX4:$dst, 8),
(STXSD (XSCVDPUXDS f64:$src), iaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 2),
(STXSIHX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 1),
(STXSIBX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
// Round & Convert QP -> DP/SP
def : Pat<(f64 (fpround f128:$src)), (f64 (XSCVQPDP $src))>;
def : Pat<(f32 (fpround f128:$src)), (f32 (XSRSP (XSCVQPDPO $src)))>;
// Convert SP -> QP
def : Pat<(f128 (fpextend f32:$src)),
(f128 (XSCVDPQP (COPY_TO_REGCLASS $src, VFRC)))>;
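// A C sketch (assumes __float128 support; illustrative names) of the
// QP <-> DP/SP conversions above:
//   double     q2d(__float128 q) { return (double)q; } // XSCVQPDPO + XSRSP
//   __float128 f2q(float f)      { return f; }         // XSCVDPQP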
def : Pat<(f32 (PPCxsmaxc f32:$XA, f32:$XB)),
(f32 (COPY_TO_REGCLASS (XSMAXCDP (COPY_TO_REGCLASS $XA, VSSRC),
(COPY_TO_REGCLASS $XB, VSSRC)),
VSSRC))>;
def : Pat<(f32 (PPCxsminc f32:$XA, f32:$XB)),
(f32 (COPY_TO_REGCLASS (XSMINCDP (COPY_TO_REGCLASS $XA, VSSRC),
(COPY_TO_REGCLASS $XB, VSSRC)),
VSSRC))>;
} // end HasP9Vector, AddedComplexity
let AddedComplexity = 400 in {
let Predicates = [IsISA3_0, HasP9Vector, HasDirectMove, IsBigEndian] in {
def : Pat<(f128 (PPCbuild_fp128 i64:$rB, i64:$rA)),
(f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
}
let Predicates = [IsISA3_0, HasP9Vector, HasDirectMove, IsLittleEndian] in {
def : Pat<(f128 (PPCbuild_fp128 i64:$rA, i64:$rB)),
(f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
}
}
let Predicates = [HasP9Vector], hasSideEffects = 0 in {
let mayStore = 1 in {
def SPILLTOVSR_STX : PseudoXFormMemOp<(outs),
(ins spilltovsrrc:$XT, memrr:$dst),
"#SPILLTOVSR_STX", []>;
def SPILLTOVSR_ST : PPCPostRAExpPseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst),
"#SPILLTOVSR_ST", []>;
}
let mayLoad = 1 in {
def SPILLTOVSR_LDX : PseudoXFormMemOp<(outs spilltovsrrc:$XT),
(ins memrr:$src),
"#SPILLTOVSR_LDX", []>;
def SPILLTOVSR_LD : PPCPostRAExpPseudo<(outs spilltovsrrc:$XT), (ins memrix:$src),
"#SPILLTOVSR_LD", []>;
}
}
// Integer extend helper dags 32 -> 64
def AnyExts {
dag A = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32);
dag B = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $B, sub_32);
dag C = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $C, sub_32);
dag D = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $D, sub_32);
}
def DblToFlt {
dag A0 = (f32 (fpround (f64 (extractelt v2f64:$A, 0))));
dag A1 = (f32 (fpround (f64 (extractelt v2f64:$A, 1))));
dag B0 = (f32 (fpround (f64 (extractelt v2f64:$B, 0))));
dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1))));
}
def ExtDbl {
dag A0S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$A, 0))))));
dag A1S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$A, 1))))));
dag B0S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$B, 0))))));
dag B1S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$B, 1))))));
dag A0U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$A, 0))))));
dag A1U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$A, 1))))));
dag B0U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$B, 0))))));
dag B1U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$B, 1))))));
}
def ByteToWord {
dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8));
dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8));
dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8));
dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8));
dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 3)), i8));
dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 7)), i8));
dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 11)), i8));
dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 15)), i8));
}
def ByteToDWord {
dag LE_A0 = (i64 (sext_inreg
(i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8));
dag LE_A1 = (i64 (sext_inreg
(i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8));
dag BE_A0 = (i64 (sext_inreg
(i64 (anyext (i32 (vector_extract v16i8:$A, 7)))), i8));
dag BE_A1 = (i64 (sext_inreg
(i64 (anyext (i32 (vector_extract v16i8:$A, 15)))), i8));
}
def HWordToWord {
dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16));
dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16));
dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16));
dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16));
dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 1)), i16));
dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 3)), i16));
dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 5)), i16));
dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 7)), i16));
}
def HWordToDWord {
dag LE_A0 = (i64 (sext_inreg
(i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16));
dag LE_A1 = (i64 (sext_inreg
(i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16));
dag BE_A0 = (i64 (sext_inreg
(i64 (anyext (i32 (vector_extract v8i16:$A, 3)))), i16));
dag BE_A1 = (i64 (sext_inreg
(i64 (anyext (i32 (vector_extract v8i16:$A, 7)))), i16));
}
def WordToDWord {
dag LE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0))));
dag LE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2))));
dag BE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 1))));
dag BE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 3))));
}
def FltToIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (extloadf32 xoaddr:$A)))));
}
def FltToUIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (extloadf32 xoaddr:$A)))));
}
def FltToLongLoad {
dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A)))));
}
def FltToLongLoadP9 {
dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddrX4:$A)))));
}
def FltToULongLoad {
dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A)))));
}
def FltToULongLoadP9 {
dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddrX4:$A)))));
}
def FltToLong {
dag A = (i64 (PPCmfvsr (f64 (PPCfctidz (fpextend f32:$A)))));
}
def FltToULong {
dag A = (i64 (PPCmfvsr (f64 (PPCfctiduz (fpextend f32:$A)))));
}
def DblToInt {
dag A = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$A))));
dag B = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$B))));
dag C = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$C))));
dag D = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$D))));
}
def DblToUInt {
dag A = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$A))));
dag B = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$B))));
dag C = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$C))));
dag D = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$D))));
}
def DblToLong {
dag A = (i64 (PPCmfvsr (f64 (PPCfctidz f64:$A))));
}
def DblToULong {
dag A = (i64 (PPCmfvsr (f64 (PPCfctiduz f64:$A))));
}
def DblToIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A)))));
}
def DblToIntLoadP9 {
dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddrX4:$A)))));
}
def DblToUIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A)))));
}
def DblToUIntLoadP9 {
dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddrX4:$A)))));
}
def DblToLongLoad {
dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A)))));
}
def DblToULongLoad {
dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load xoaddr:$A)))));
}
// FP load dags (for f32 -> v4f32)
def LoadFP {
dag A = (f32 (load xoaddr:$A));
dag B = (f32 (load xoaddr:$B));
dag C = (f32 (load xoaddr:$C));
dag D = (f32 (load xoaddr:$D));
}
// FP merge dags (for f32 -> v4f32)
def MrgFP {
dag LD32A = (COPY_TO_REGCLASS (LIWZX xoaddr:$A), VSRC);
dag LD32B = (COPY_TO_REGCLASS (LIWZX xoaddr:$B), VSRC);
dag LD32C = (COPY_TO_REGCLASS (LIWZX xoaddr:$C), VSRC);
dag LD32D = (COPY_TO_REGCLASS (LIWZX xoaddr:$D), VSRC);
dag AC = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $A, VSRC),
(COPY_TO_REGCLASS $C, VSRC), 0));
dag BD = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $B, VSRC),
(COPY_TO_REGCLASS $D, VSRC), 0));
dag ABhToFlt = (XVCVDPSP (XXPERMDI $A, $B, 0));
dag ABlToFlt = (XVCVDPSP (XXPERMDI $A, $B, 3));
dag BAhToFlt = (XVCVDPSP (XXPERMDI $B, $A, 0));
dag BAlToFlt = (XVCVDPSP (XXPERMDI $B, $A, 3));
}
// Word-element merge dags - conversions from f64 to i32 merged into vectors.
def MrgWords {
// For big endian, we merge low and high doublewords (A, B).
dag A0B0 = (v2f64 (XXPERMDI v2f64:$A, v2f64:$B, 0));
dag A1B1 = (v2f64 (XXPERMDI v2f64:$A, v2f64:$B, 3));
dag CVA1B1S = (v4i32 (XVCVDPSXWS A1B1));
dag CVA0B0S = (v4i32 (XVCVDPSXWS A0B0));
dag CVA1B1U = (v4i32 (XVCVDPUXWS A1B1));
dag CVA0B0U = (v4i32 (XVCVDPUXWS A0B0));
// For little endian, we merge low and high doublewords (B, A).
dag B1A1 = (v2f64 (XXPERMDI v2f64:$B, v2f64:$A, 0));
dag B0A0 = (v2f64 (XXPERMDI v2f64:$B, v2f64:$A, 3));
dag CVB1A1S = (v4i32 (XVCVDPSXWS B1A1));
dag CVB0A0S = (v4i32 (XVCVDPSXWS B0A0));
dag CVB1A1U = (v4i32 (XVCVDPUXWS B1A1));
dag CVB0A0U = (v4i32 (XVCVDPUXWS B0A0));
// For big endian, we merge high doublewords of (A, C) and (B, D), convert,
// then merge.
dag AC = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$A, VSRC),
(COPY_TO_REGCLASS f64:$C, VSRC), 0));
dag BD = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$B, VSRC),
(COPY_TO_REGCLASS f64:$D, VSRC), 0));
dag CVACS = (v4i32 (XVCVDPSXWS AC));
dag CVBDS = (v4i32 (XVCVDPSXWS BD));
dag CVACU = (v4i32 (XVCVDPUXWS AC));
dag CVBDU = (v4i32 (XVCVDPUXWS BD));
// For little endian, we merge high doublewords of (D, B) and (C, A), convert,
// then merge.
dag DB = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$D, VSRC),
(COPY_TO_REGCLASS f64:$B, VSRC), 0));
dag CA = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$C, VSRC),
(COPY_TO_REGCLASS f64:$A, VSRC), 0));
dag CVDBS = (v4i32 (XVCVDPSXWS DB));
dag CVCAS = (v4i32 (XVCVDPSXWS CA));
dag CVDBU = (v4i32 (XVCVDPUXWS DB));
dag CVCAU = (v4i32 (XVCVDPUXWS CA));
}
// Patterns for BUILD_VECTOR nodes.
let AddedComplexity = 400 in {
let Predicates = [HasVSX] in {
// Build vectors of floating point converted to i32.
def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.A,
DblToInt.A, DblToInt.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS $A), VSRC), 1))>;
def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.A,
DblToUInt.A, DblToUInt.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS $A), VSRC), 1))>;
def : Pat<(v2i64 (build_vector DblToLong.A, DblToLong.A)),
(v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC),
(COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC), 0))>;
def : Pat<(v2i64 (build_vector DblToULong.A, DblToULong.A)),
(v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC),
(COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>;
def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
(XSCVDPSXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>;
def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
(XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>;
def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
(v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
def : Pat<(v2f64 (PPCldsplat xoaddr:$A)),
(v2f64 (LXVDSX xoaddr:$A))>;
def : Pat<(v2i64 (PPCldsplat xoaddr:$A)),
(v2i64 (LXVDSX xoaddr:$A))>;
// Build vectors of floating point converted to i64.
def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
(v2i64 (XXPERMDIs
(COPY_TO_REGCLASS (XSCVDPSXDSs $A), VSFRC), 0))>;
def : Pat<(v2i64 (build_vector FltToULong.A, FltToULong.A)),
(v2i64 (XXPERMDIs
(COPY_TO_REGCLASS (XSCVDPUXDSs $A), VSFRC), 0))>;
def : Pat<(v2i64 (scalar_to_vector DblToLongLoad.A)),
(v2i64 (XVCVDPSXDS (LXVDSX xoaddr:$A)))>;
def : Pat<(v2i64 (scalar_to_vector DblToULongLoad.A)),
(v2i64 (XVCVDPUXDS (LXVDSX xoaddr:$A)))>;
}
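// A C sketch (GNU vector extensions; illustrative name) of the
// convert-and-splat patterns above:
//   typedef int v4i32 __attribute__((vector_size(16)));
//   v4i32 splat_d2i(double d) {
//     int i = (int)d;             // XSCVDPSXWS
//     return (v4i32){i, i, i, i}; // XXSPLTW
//   }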
let Predicates = [HasVSX, NoP9Vector] in {
// Load-and-splat with fp-to-int conversion (using X-Form VSX/FP loads).
def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
(XSCVDPSXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>;
def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
(XSCVDPUXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>;
def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)),
(v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
(XFLOADf32 xoaddr:$A), VSFRC)), 0))>;
def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)),
(v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
(XFLOADf32 xoaddr:$A), VSFRC)), 0))>;
}
let Predicates = [IsBigEndian, HasP8Vector] in {
def : Pat<DWToSPExtractConv.BVU,
(v4f32 (VPKUDUM (XXSLDWI (XVCVUXDSP $S1), (XVCVUXDSP $S1), 3),
(XXSLDWI (XVCVUXDSP $S2), (XVCVUXDSP $S2), 3)))>;
def : Pat<DWToSPExtractConv.BVS,
(v4f32 (VPKUDUM (XXSLDWI (XVCVSXDSP $S1), (XVCVSXDSP $S1), 3),
(XXSLDWI (XVCVSXDSP $S2), (XVCVSXDSP $S2), 3)))>;
def : Pat<(store (i32 (extractelt v4i32:$A, 1)), xoaddr:$src),
(STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
def : Pat<(store (f32 (extractelt v4f32:$A, 1)), xoaddr:$src),
(STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
// Elements in a register on a BE system are in order <0, 1, 2, 3>.
// The store instructions store the second word from the left.
// So to align element zero, we need to modulo-left-shift by 3 words.
// Similar logic applies for elements 2 and 3.
foreach Idx = [ [0,3], [2,1], [3,2] ] in {
def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src),
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
sub_64), xoaddr:$src)>;
def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src),
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
sub_64), xoaddr:$src)>;
}
}
let Predicates = [HasP8Vector, IsBigEndian, NoP9Vector] in {
def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
xoaddr:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
xoaddr:$src)>;
}
// Big endian, available on all targets with VSX
let Predicates = [IsBigEndian, HasVSX] in {
def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
(v2f64 (XXPERMDI
(COPY_TO_REGCLASS $A, VSRC),
(COPY_TO_REGCLASS $B, VSRC), 0))>;
// Using VMRGEW to assemble the final vector would be a lower latency
// solution. However, we choose to go with the slightly higher latency
// XXPERMDI for 2 reasons:
// 1. This is likely to occur in unrolled loops where register pressure is
//    high, so we want to use XXPERMDI as it has access to all 64 VSX
//    registers.
// 2. Using Altivec instructions in this sequence would likely cause the
// allocation of Altivec registers even for the loads which in turn would
// force the use of LXSIWZX for the loads, adding a cycle of latency to
// each of the loads which would otherwise be able to use LFIWZX.
def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)),
(v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32A, MrgFP.LD32B),
(XXMRGHW MrgFP.LD32C, MrgFP.LD32D), 3))>;
def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)),
(VMRGEW MrgFP.AC, MrgFP.BD)>;
def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
DblToFlt.B0, DblToFlt.B1)),
(v4f32 (VMRGEW MrgFP.ABhToFlt, MrgFP.ABlToFlt))>;
// Convert 4 doubles to a vector of ints.
def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B,
DblToInt.C, DblToInt.D)),
(v4i32 (VMRGEW MrgWords.CVACS, MrgWords.CVBDS))>;
def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B,
DblToUInt.C, DblToUInt.D)),
(v4i32 (VMRGEW MrgWords.CVACU, MrgWords.CVBDU))>;
def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S,
ExtDbl.B0S, ExtDbl.B1S)),
(v4i32 (VMRGEW MrgWords.CVA0B0S, MrgWords.CVA1B1S))>;
def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
ExtDbl.B0U, ExtDbl.B1U)),
(v4i32 (VMRGEW MrgWords.CVA0B0U, MrgWords.CVA1B1U))>;
}
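// A C sketch (GNU vector extensions; illustrative name) of building a v4f32
// from four loaded floats, the case handled by the LoadFP/MrgFP patterns
// above:
//   typedef float v4f32 __attribute__((vector_size(16)));
//   v4f32 gather4(const float *a, const float *b, const float *c,
//                 const float *d) {
//     return (v4f32){*a, *b, *c, *d}; // LIWZX loads + XXMRGHW + XXPERMDI
//   }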
let Predicates = [IsLittleEndian, HasP8Vector] in {
def : Pat<DWToSPExtractConv.BVU,
(v4f32 (VPKUDUM (XXSLDWI (XVCVUXDSP $S2), (XVCVUXDSP $S2), 3),
(XXSLDWI (XVCVUXDSP $S1), (XVCVUXDSP $S1), 3)))>;
def : Pat<DWToSPExtractConv.BVS,
(v4f32 (VPKUDUM (XXSLDWI (XVCVSXDSP $S2), (XVCVSXDSP $S2), 3),
(XXSLDWI (XVCVSXDSP $S1), (XVCVSXDSP $S1), 3)))>;
def : Pat<(store (i32 (extractelt v4i32:$A, 2)), xoaddr:$src),
(STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
def : Pat<(store (f32 (extractelt v4f32:$A, 2)), xoaddr:$src),
(STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
// Elements in a register on a LE system are in order <3, 2, 1, 0>.
// The store instructions store the second word from the left.
// So to align element 3, we need to modulo-left-shift by 3 words.
// Similar logic applies for elements 0 and 1.
foreach Idx = [ [0,2], [1,1], [3,3] ] in {
def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src),
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
sub_64), xoaddr:$src)>;
def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src),
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
sub_64), xoaddr:$src)>;
}
}
let Predicates = [HasP8Vector, IsLittleEndian, NoP9Vector] in {
def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
xoaddr:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
xoaddr:$src)>;
def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
}
let Predicates = [IsLittleEndian, HasVSX] in {
// Little endian, available on all targets with VSX
def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
(v2f64 (XXPERMDI
(COPY_TO_REGCLASS $B, VSRC),
(COPY_TO_REGCLASS $A, VSRC), 0))>;
// Using VMRGEW to assemble the final vector would be a lower latency
// solution. However, we choose to go with the slightly higher latency
// XXPERMDI for 2 reasons:
// 1. This is likely to occur in unrolled loops where register pressure is
//    high, so we want to use XXPERMDI as it has access to all 64 VSX
//    registers.
// 2. Using Altivec instructions in this sequence would likely cause the
// allocation of Altivec registers even for the loads which in turn would
// force the use of LXSIWZX for the loads, adding a cycle of latency to
// each of the loads which would otherwise be able to use LFIWZX.
def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)),
(v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32D, MrgFP.LD32C),
(XXMRGHW MrgFP.LD32B, MrgFP.LD32A), 3))>;
def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)),
(VMRGEW MrgFP.AC, MrgFP.BD)>;
def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
DblToFlt.B0, DblToFlt.B1)),
(v4f32 (VMRGEW MrgFP.BAhToFlt, MrgFP.BAlToFlt))>;
// Convert 4 doubles to a vector of ints.
def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B,
DblToInt.C, DblToInt.D)),
(v4i32 (VMRGEW MrgWords.CVDBS, MrgWords.CVCAS))>;
def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B,
DblToUInt.C, DblToUInt.D)),
(v4i32 (VMRGEW MrgWords.CVDBU, MrgWords.CVCAU))>;
def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S,
ExtDbl.B0S, ExtDbl.B1S)),
(v4i32 (VMRGEW MrgWords.CVB1A1S, MrgWords.CVB0A0S))>;
def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
ExtDbl.B0U, ExtDbl.B1U)),
(v4i32 (VMRGEW MrgWords.CVB1A1U, MrgWords.CVB0A0U))>;
}
let Predicates = [HasDirectMove] in {
// Endianness-neutral constant splat on P8 and newer targets. The reason
// for this pattern is that on targets with direct moves, we don't expand
// BUILD_VECTOR nodes for v4i32.
def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
immSExt5NonZero:$A, immSExt5NonZero:$A)),
(v4i32 (VSPLTISW imm:$A))>;
}
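// A C sketch (GNU vector extensions; illustrative name) of the small-constant
// splat above:
//   typedef int v4i32 __attribute__((vector_size(16)));
//   v4i32 splat5(void) { return (v4i32){5, 5, 5, 5}; } // VSPLTISW 5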
let Predicates = [IsBigEndian, HasDirectMove, NoP9Vector] in {
// Big endian integer vectors using direct moves.
def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
(v2i64 (XXPERMDI
(COPY_TO_REGCLASS (MTVSRD $A), VSRC),
(COPY_TO_REGCLASS (MTVSRD $B), VSRC), 0))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
(XXPERMDI
(COPY_TO_REGCLASS
(MTVSRD (RLDIMI AnyExts.B, AnyExts.A, 32, 0)), VSRC),
(COPY_TO_REGCLASS
(MTVSRD (RLDIMI AnyExts.D, AnyExts.C, 32, 0)), VSRC), 0)>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
(XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
}
let Predicates = [IsLittleEndian, HasDirectMove, NoP9Vector] in {
// Little endian integer vectors using direct moves.
def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
(v2i64 (XXPERMDI
(COPY_TO_REGCLASS (MTVSRD $B), VSRC),
(COPY_TO_REGCLASS (MTVSRD $A), VSRC), 0))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
(XXPERMDI
(COPY_TO_REGCLASS
(MTVSRD (RLDIMI AnyExts.C, AnyExts.D, 32, 0)), VSRC),
(COPY_TO_REGCLASS
(MTVSRD (RLDIMI AnyExts.A, AnyExts.B, 32, 0)), VSRC), 0)>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
(XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
}
let Predicates = [HasP8Vector] in {
def : Pat<(v1i128 (bitconvert (v16i8 immAllOnesV))),
(v1i128 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
def : Pat<(v2i64 (bitconvert (v16i8 immAllOnesV))),
(v2i64 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
def : Pat<(v8i16 (bitconvert (v16i8 immAllOnesV))),
(v8i16 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
def : Pat<(v16i8 (bitconvert (v16i8 immAllOnesV))),
(v16i8 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
}
let Predicates = [HasP9Vector] in {
// Endianness-neutral patterns for const splats with ISA 3.0 instructions.
def : Pat<(v4i32 (scalar_to_vector i32:$A)),
(v4i32 (MTVSRWS $A))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
(v4i32 (MTVSRWS $A))>;
def : Pat<(v16i8 (build_vector immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A)),
(v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>;
def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)),
(v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>;
def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
(v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>;
def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
(XSCVDPSXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1))>;
def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
(XSCVDPUXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1))>;
def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)),
(v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
(DFLOADf32 iaddrX4:$A),
VSFRC)), 0))>;
def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)),
(v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
(DFLOADf32 iaddrX4:$A),
VSFRC)), 0))>;
def : Pat<(v4f32 (PPCldsplat xoaddr:$A)),
(v4f32 (LXVWSX xoaddr:$A))>;
def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
(v4i32 (LXVWSX xoaddr:$A))>;
}
let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
def : Pat<(i64 (extractelt v2i64:$A, 1)),
(i64 (MFVSRLD $A))>;
// Better way to build integer vectors if we have MTVSRDD. Big endian.
def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
(v2i64 (MTVSRDD $rB, $rA))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
(MTVSRDD
(RLDIMI AnyExts.B, AnyExts.A, 32, 0),
(RLDIMI AnyExts.D, AnyExts.C, 32, 0))>;
}
let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
def : Pat<(i64 (extractelt v2i64:$A, 0)),
(i64 (MFVSRLD $A))>;
// Better way to build integer vectors if we have MTVSRDD. Little endian.
def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
(v2i64 (MTVSRDD $rB, $rA))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
(MTVSRDD
(RLDIMI AnyExts.C, AnyExts.D, 32, 0),
(RLDIMI AnyExts.A, AnyExts.B, 32, 0))>;
}
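// A C sketch (GNU vector extensions; illustrative name) of the two-doubleword
// build covered by MTVSRDD:
//   typedef long long v2i64 __attribute__((vector_size(16)));
//   v2i64 pack2(long long a, long long b) { return (v2i64){a, b}; } // MTVSRDD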
// P9 Altivec instructions that can be used to build vectors.
// They are added to PPCInstrVSX.td rather than PPCInstrAltivec.td so that they
// can compete with the complexities of the existing build vector patterns in
// this file.
let Predicates = [HasP9Altivec, IsLittleEndian] in {
def : Pat<(v2i64 (build_vector WordToDWord.LE_A0, WordToDWord.LE_A1)),
(v2i64 (VEXTSW2D $A))>;
def : Pat<(v2i64 (build_vector HWordToDWord.LE_A0, HWordToDWord.LE_A1)),
(v2i64 (VEXTSH2D $A))>;
def : Pat<(v4i32 (build_vector HWordToWord.LE_A0, HWordToWord.LE_A1,
HWordToWord.LE_A2, HWordToWord.LE_A3)),
(v4i32 (VEXTSH2W $A))>;
def : Pat<(v4i32 (build_vector ByteToWord.LE_A0, ByteToWord.LE_A1,
ByteToWord.LE_A2, ByteToWord.LE_A3)),
(v4i32 (VEXTSB2W $A))>;
def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)),
(v2i64 (VEXTSB2D $A))>;
}
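// A C sketch (GNU vector extensions; illustrative name) of the element sign
// extensions matched above (little-endian element numbering):
//   typedef signed char v16i8 __attribute__((vector_size(16)));
//   typedef long long   v2i64 __attribute__((vector_size(16)));
//   v2i64 sext_bytes(v16i8 v) { return (v2i64){v[0], v[8]}; } // VEXTSB2D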
let Predicates = [HasP9Altivec, IsBigEndian] in {
def : Pat<(v2i64 (build_vector WordToDWord.BE_A0, WordToDWord.BE_A1)),
(v2i64 (VEXTSW2D $A))>;
def : Pat<(v2i64 (build_vector HWordToDWord.BE_A0, HWordToDWord.BE_A1)),
(v2i64 (VEXTSH2D $A))>;
def : Pat<(v4i32 (build_vector HWordToWord.BE_A0, HWordToWord.BE_A1,
HWordToWord.BE_A2, HWordToWord.BE_A3)),
(v4i32 (VEXTSH2W $A))>;
def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1,
ByteToWord.BE_A2, ByteToWord.BE_A3)),
(v4i32 (VEXTSB2W $A))>;
def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)),
(v2i64 (VEXTSB2D $A))>;
}
let Predicates = [HasP9Altivec] in {
def: Pat<(v2i64 (PPCSExtVElems v16i8:$A)),
(v2i64 (VEXTSB2D $A))>;
def: Pat<(v2i64 (PPCSExtVElems v8i16:$A)),
(v2i64 (VEXTSH2D $A))>;
def: Pat<(v2i64 (PPCSExtVElems v4i32:$A)),
(v2i64 (VEXTSW2D $A))>;
def: Pat<(v4i32 (PPCSExtVElems v16i8:$A)),
(v4i32 (VEXTSB2W $A))>;
def: Pat<(v4i32 (PPCSExtVElems v8i16:$A)),
(v4i32 (VEXTSH2W $A))>;
}
}
// Put this P9Altivec-related definition here, since it may be selected into
// the VSX instruction xvnegsp, to avoid a possible undef.
let Predicates = [HasP9Altivec] in {
def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 0))),
(v4i32 (VABSDUW $A, $B))>;
def : Pat<(v8i16 (PPCvabsd v8i16:$A, v8i16:$B, (i32 0))),
(v8i16 (VABSDUH $A, $B))>;
def : Pat<(v16i8 (PPCvabsd v16i8:$A, v16i8:$B, (i32 0))),
(v16i8 (VABSDUB $A, $B))>;
// As described for PPCvabsd, the last operand indicates whether to do the
// sign bit flip.
def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 1))),
(v4i32 (VABSDUW (XVNEGSP $A), (XVNEGSP $B)))>;
}
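// A C sketch (illustrative name) of the unsigned absolute-difference loop the
// PPCvabsd patterns above target; the loop vectorizer may turn this into
// VABSDUB:
//   #include <stdint.h>
//   void absd16(uint8_t *restrict r, const uint8_t *a, const uint8_t *b) {
//     for (int i = 0; i < 16; ++i)
//       r[i] = a[i] > b[i] ? (uint8_t)(a[i] - b[i]) : (uint8_t)(b[i] - a[i]);
//   }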
Index: head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h (revision 362609)
@@ -1,393 +1,395 @@
//===-- PPCSubtarget.h - Define Subtarget for the PPC ----------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the PowerPC specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H
#define LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H
#include "PPCFrameLowering.h"
#include "PPCISelLowering.h"
#include "PPCInstrInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
#include <string>
#define GET_SUBTARGETINFO_HEADER
#include "PPCGenSubtargetInfo.inc"
// GCC #defines PPC on Linux but we use it as our namespace name
#undef PPC
namespace llvm {
class StringRef;
namespace PPC {
// -m directive values.
enum {
DIR_NONE,
DIR_32,
DIR_440,
DIR_601,
DIR_602,
DIR_603,
DIR_7400,
DIR_750,
DIR_970,
DIR_A2,
DIR_E500,
DIR_E500mc,
DIR_E5500,
DIR_PWR3,
DIR_PWR4,
DIR_PWR5,
DIR_PWR5X,
DIR_PWR6,
DIR_PWR6X,
DIR_PWR7,
DIR_PWR8,
DIR_PWR9,
DIR_PWR_FUTURE,
DIR_64
};
}
class GlobalValue;
class TargetMachine;
class PPCSubtarget : public PPCGenSubtargetInfo {
public:
enum POPCNTDKind {
POPCNTD_Unavailable,
POPCNTD_Slow,
POPCNTD_Fast
};
protected:
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
/// StackAlignment - The minimum alignment known to hold for the stack frame
/// on entry to the function, which every function must maintain.
Align StackAlignment;
/// Selected instruction itineraries (one entry per itinerary class.)
InstrItineraryData InstrItins;
/// Which cpu directive was used.
unsigned CPUDirective;
/// Used by the ISel to turn on optimizations for POWER4-derived architectures
bool HasMFOCRF;
bool Has64BitSupport;
bool Use64BitRegs;
bool UseCRBits;
bool HasHardFloat;
bool IsPPC64;
bool HasAltivec;
bool HasFPU;
bool HasSPE;
bool HasQPX;
bool HasVSX;
bool NeedsTwoConstNR;
bool HasP8Vector;
bool HasP8Altivec;
bool HasP8Crypto;
bool HasP9Vector;
bool HasP9Altivec;
bool HasFCPSGN;
bool HasFSQRT;
bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
bool HasRecipPrec;
bool HasSTFIWX;
bool HasLFIWAX;
bool HasFPRND;
bool HasFPCVT;
bool HasISEL;
bool HasBPERMD;
bool HasExtDiv;
bool HasCMPB;
bool HasLDBRX;
bool IsBookE;
bool HasOnlyMSYNC;
bool IsE500;
bool IsPPC4xx;
bool IsPPC6xx;
bool FeatureMFTB;
+ bool AllowsUnalignedFPAccess;
bool DeprecatedDST;
bool HasLazyResolverStubs;
bool IsLittleEndian;
bool HasICBT;
bool HasInvariantFunctionDescriptors;
bool HasPartwordAtomics;
bool HasDirectMove;
bool HasHTM;
bool HasFloat128;
bool IsISA3_0;
bool UseLongCalls;
bool SecurePlt;
bool VectorsUseTwoUnits;
bool UsePPCPreRASchedStrategy;
bool UsePPCPostRASchedStrategy;
POPCNTDKind HasPOPCNTD;
/// When targeting QPX running a stock PPC64 Linux kernel where the stack
/// alignment has not been changed, we need to keep the 16-byte alignment
/// of the stack.
bool IsQPXStackUnaligned;
const PPCTargetMachine &TM;
PPCFrameLowering FrameLowering;
PPCInstrInfo InstrInfo;
PPCTargetLowering TLInfo;
SelectionDAGTargetInfo TSInfo;
public:
/// This constructor initializes the data members to match those of the
/// specified triple.
///
PPCSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
const PPCTargetMachine &TM);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
/// getStackAlignment - Returns the minimum alignment known to hold for the
/// stack frame on entry to the function, which every function for this
/// subtarget must maintain.
Align getStackAlignment() const { return StackAlignment; }
/// getDarwinDirective - Returns the -m directive specified for the cpu.
unsigned getDarwinDirective() const { return CPUDirective; }
/// getCPUDirective - Returns the -m directive specified for the cpu.
///
unsigned getCPUDirective() const { return CPUDirective; }
/// getInstrItins - Return the instruction itineraries based on subtarget
/// selection.
const InstrItineraryData *getInstrItineraryData() const override {
return &InstrItins;
}
const PPCFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
const PPCInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const PPCTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
const PPCRegisterInfo *getRegisterInfo() const override {
return &getInstrInfo()->getRegisterInfo();
}
const PPCTargetMachine &getTargetMachine() const { return TM; }
/// initializeSubtargetDependencies - Initializes using a CPU and feature string
/// so that we can use initializer lists for subtarget initialization.
PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
private:
void initializeEnvironment();
void initSubtargetFeatures(StringRef CPU, StringRef FS);
public:
/// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
///
bool isPPC64() const;
/// has64BitSupport - Return true if the selected CPU supports 64-bit
/// instructions, regardless of whether we are in 32-bit or 64-bit mode.
bool has64BitSupport() const { return Has64BitSupport; }
// useSoftFloat - Return true if soft-float option is turned on.
bool useSoftFloat() const {
if (isAIXABI() && !HasHardFloat)
report_fatal_error("soft-float is not yet supported on AIX.");
return !HasHardFloat;
}
/// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
/// registers in 32-bit mode when possible. This can only be true if
/// has64BitSupport() returns true.
bool use64BitRegs() const { return Use64BitRegs; }
/// useCRBits - Return true if we should store and manipulate i1 values in
/// the individual condition register bits.
bool useCRBits() const { return UseCRBits; }
/// hasLazyResolverStub - Return true if accesses to the specified global have
/// to go through a dyld lazy resolution stub. This means that an extra load
/// is required to get the address of the global.
bool hasLazyResolverStub(const GlobalValue *GV) const;
// isLittleEndian - True if generating little-endian code
bool isLittleEndian() const { return IsLittleEndian; }
// Specific obvious features.
bool hasFCPSGN() const { return HasFCPSGN; }
bool hasFSQRT() const { return HasFSQRT; }
bool hasFRE() const { return HasFRE; }
bool hasFRES() const { return HasFRES; }
bool hasFRSQRTE() const { return HasFRSQRTE; }
bool hasFRSQRTES() const { return HasFRSQRTES; }
bool hasRecipPrec() const { return HasRecipPrec; }
bool hasSTFIWX() const { return HasSTFIWX; }
bool hasLFIWAX() const { return HasLFIWAX; }
bool hasFPRND() const { return HasFPRND; }
bool hasFPCVT() const { return HasFPCVT; }
bool hasAltivec() const { return HasAltivec; }
bool hasSPE() const { return HasSPE; }
bool hasFPU() const { return HasFPU; }
bool hasQPX() const { return HasQPX; }
bool hasVSX() const { return HasVSX; }
bool needsTwoConstNR() const { return NeedsTwoConstNR; }
bool hasP8Vector() const { return HasP8Vector; }
bool hasP8Altivec() const { return HasP8Altivec; }
bool hasP8Crypto() const { return HasP8Crypto; }
bool hasP9Vector() const { return HasP9Vector; }
bool hasP9Altivec() const { return HasP9Altivec; }
bool hasMFOCRF() const { return HasMFOCRF; }
bool hasISEL() const { return HasISEL; }
bool hasBPERMD() const { return HasBPERMD; }
bool hasExtDiv() const { return HasExtDiv; }
bool hasCMPB() const { return HasCMPB; }
bool hasLDBRX() const { return HasLDBRX; }
bool isBookE() const { return IsBookE; }
bool hasOnlyMSYNC() const { return HasOnlyMSYNC; }
bool isPPC4xx() const { return IsPPC4xx; }
bool isPPC6xx() const { return IsPPC6xx; }
bool isSecurePlt() const { return SecurePlt; }
bool vectorsUseTwoUnits() const { return VectorsUseTwoUnits; }
bool isE500() const { return IsE500; }
bool isFeatureMFTB() const { return FeatureMFTB; }
+ bool allowsUnalignedFPAccess() const { return AllowsUnalignedFPAccess; }
bool isDeprecatedDST() const { return DeprecatedDST; }
bool hasICBT() const { return HasICBT; }
bool hasInvariantFunctionDescriptors() const {
return HasInvariantFunctionDescriptors;
}
bool usePPCPreRASchedStrategy() const { return UsePPCPreRASchedStrategy; }
bool usePPCPostRASchedStrategy() const { return UsePPCPostRASchedStrategy; }
bool hasPartwordAtomics() const { return HasPartwordAtomics; }
bool hasDirectMove() const { return HasDirectMove; }
bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; }
Align getPlatformStackAlignment() const {
if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned())
return Align(32);
return Align(16);
}
// DarwinABI has a 224-byte red zone. The PPC32 SVR4 ABI (non-Darwin) has no
// red zone and the PPC64 SVR4 ABI has a 288-byte red zone.
unsigned getRedZoneSize() const {
return isDarwinABI() ? 224 : (isPPC64() ? 288 : 0);
}
bool hasHTM() const { return HasHTM; }
bool hasFloat128() const { return HasFloat128; }
bool isISA3_0() const { return IsISA3_0; }
bool useLongCalls() const { return UseLongCalls; }
bool needsSwapsForVSXMemOps() const {
return hasVSX() && isLittleEndian() && !hasP9Vector();
}
POPCNTDKind hasPOPCNTD() const { return HasPOPCNTD; }
const Triple &getTargetTriple() const { return TargetTriple; }
/// isDarwin - True if this is any darwin platform.
bool isDarwin() const { return TargetTriple.isMacOSX(); }
/// isBGQ - True if this is a BG/Q platform.
bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isDarwinABI() const { return isTargetMachO() || isDarwin(); }
bool isAIXABI() const { return TargetTriple.isOSAIX(); }
bool isSVR4ABI() const { return !isDarwinABI() && !isAIXABI(); }
bool isELFv2ABI() const;
bool is64BitELFABI() const { return isSVR4ABI() && isPPC64(); }
bool is32BitELFABI() const { return isSVR4ABI() && !isPPC64(); }
/// Originally, this function returned hasISEL(). Now we always enable it,
/// but may expand the ISEL instruction later.
bool enableEarlyIfConversion() const override { return true; }
/// Scheduling customization.
bool enableMachineScheduler() const override;
/// Pipeliner customization.
bool enableMachinePipeliner() const override;
/// Machine Pipeliner customization
bool useDFAforSMS() const override;
/// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
bool enablePostRAScheduler() const override;
AntiDepBreakMode getAntiDepBreakMode() const override;
void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
bool useAA() const override;
bool enableSubRegLiveness() const override;
/// True if the GV will be accessed via an indirect symbol.
bool isGVIndirectSymbol(const GlobalValue *GV) const;
/// True if the ABI is descriptor based.
bool usesFunctionDescriptors() const {
// Both 32-bit and 64-bit AIX are descriptor based. For ELF only the 64-bit
// v1 ABI uses descriptors.
return isAIXABI() || (is64BitELFABI() && !isELFv2ABI());
}
unsigned descriptorTOCAnchorOffset() const {
assert(usesFunctionDescriptors() &&
"Should only be called when the target uses descriptors.");
return IsPPC64 ? 8 : 4;
}
unsigned descriptorEnvironmentPointerOffset() const {
assert(usesFunctionDescriptors() &&
"Should only be called when the target uses descriptors.");
return IsPPC64 ? 16 : 8;
}
MCRegister getEnvironmentPointerRegister() const {
assert(usesFunctionDescriptors() &&
"Should only be called when the target uses descriptors.");
return IsPPC64 ? PPC::X11 : PPC::R11;
}
MCRegister getTOCPointerRegister() const {
assert((is64BitELFABI() || isAIXABI()) &&
"Should only be called when the target is a TOC based ABI.");
return IsPPC64 ? PPC::X2 : PPC::R2;
}
MCRegister getStackPointerRegister() const {
return IsPPC64 ? PPC::X1 : PPC::R1;
}
bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; }
};
} // End llvm namespace
#endif
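A minimal sketch (assumed, not part of this patch) of the kind of query the
new AllowsUnalignedFPAccess bit enables: lowering code can ask the subtarget
whether an unaligned floating-point access may stay a single load/store or
must be expanded. The helper below is hypothetical.

#include "PPCSubtarget.h"

// Hypothetical helper: returns true when an FP access of AccessBytes bytes at
// alignment AccessAlign should be expanded for this subtarget.
static bool mustExpandUnalignedFPAccess(const llvm::PPCSubtarget &ST,
                                        llvm::Align AccessAlign,
                                        unsigned AccessBytes) {
  // CPUs that tolerate misaligned FP loads/stores keep the single access.
  if (ST.allowsUnalignedFPAccess())
    return false;
  // Otherwise, expand anything that is not naturally aligned.
  return AccessAlign.value() < AccessBytes;
}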
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86RetpolineThunks.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86RetpolineThunks.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86RetpolineThunks.cpp (nonexistent)
@@ -1,286 +0,0 @@
-//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// Pass that injects an MI thunk implementing a "retpoline". This is
-/// a RET-implemented trampoline that is used to lower indirect calls in a way
-/// that prevents speculation on some x86 processors and can be used to mitigate
-/// security vulnerabilities due to targeted speculative execution and side
-/// channels such as CVE-2017-5715.
-///
-/// TODO(chandlerc): All of this code could use better comments and
-/// documentation.
-///
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86InstrBuilder.h"
-#include "X86Subtarget.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-retpoline-thunks"
-
-static const char ThunkNamePrefix[] = "__llvm_retpoline_";
-static const char R11ThunkName[] = "__llvm_retpoline_r11";
-static const char EAXThunkName[] = "__llvm_retpoline_eax";
-static const char ECXThunkName[] = "__llvm_retpoline_ecx";
-static const char EDXThunkName[] = "__llvm_retpoline_edx";
-static const char EDIThunkName[] = "__llvm_retpoline_edi";
-
-namespace {
-class X86RetpolineThunks : public MachineFunctionPass {
-public:
- static char ID;
-
- X86RetpolineThunks() : MachineFunctionPass(ID) {}
-
- StringRef getPassName() const override { return "X86 Retpoline Thunks"; }
-
- bool doInitialization(Module &M) override;
- bool runOnMachineFunction(MachineFunction &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineModuleInfoWrapperPass>();
- AU.addPreserved<MachineModuleInfoWrapperPass>();
- }
-
-private:
- MachineModuleInfo *MMI = nullptr;
- const TargetMachine *TM = nullptr;
- bool Is64Bit = false;
- const X86Subtarget *STI = nullptr;
- const X86InstrInfo *TII = nullptr;
-
- bool InsertedThunks = false;
-
- void createThunkFunction(Module &M, StringRef Name);
- void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
- void populateThunk(MachineFunction &MF, unsigned Reg);
-};
-
-} // end anonymous namespace
-
-FunctionPass *llvm::createX86RetpolineThunksPass() {
- return new X86RetpolineThunks();
-}
-
-char X86RetpolineThunks::ID = 0;
-
-bool X86RetpolineThunks::doInitialization(Module &M) {
- InsertedThunks = false;
- return false;
-}
-
-bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << getPassName() << '\n');
-
- TM = &MF.getTarget();;
- STI = &MF.getSubtarget<X86Subtarget>();
- TII = STI->getInstrInfo();
- Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64;
-
- MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
- Module &M = const_cast<Module &>(*MMI->getModule());
-
- // If this function is not a thunk, check to see if we need to insert
- // a thunk.
- if (!MF.getName().startswith(ThunkNamePrefix)) {
- // If we've already inserted a thunk, nothing else to do.
- if (InsertedThunks)
- return false;
-
- // Only add a thunk if one of the functions has the retpoline feature
- // enabled in its subtarget, and doesn't enable external thunks.
- // FIXME: Conditionalize on indirect calls so we don't emit a thunk when
- // nothing will end up calling it.
- // FIXME: It's a little silly to look at every function just to enumerate
- // the subtargets, but eventually we'll want to look at them for indirect
- // calls, so maybe this is OK.
- if ((!STI->useRetpolineIndirectCalls() &&
- !STI->useRetpolineIndirectBranches()) ||
- STI->useRetpolineExternalThunk())
- return false;
-
- // Otherwise, we need to insert the thunk.
- // WARNING: This is not really a well behaving thing to do in a function
- // pass. We extract the module and insert a new function (and machine
- // function) directly into the module.
- if (Is64Bit)
- createThunkFunction(M, R11ThunkName);
- else
- for (StringRef Name :
- {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName})
- createThunkFunction(M, Name);
- InsertedThunks = true;
- return true;
- }
-
- // If this *is* a thunk function, we need to populate it with the correct MI.
- if (Is64Bit) {
- assert(MF.getName() == "__llvm_retpoline_r11" &&
- "Should only have an r11 thunk on 64-bit targets");
-
- // __llvm_retpoline_r11:
- // callq .Lr11_call_target
- // .Lr11_capture_spec:
- // pause
- // lfence
- // jmp .Lr11_capture_spec
- // .align 16
- // .Lr11_call_target:
- // movq %r11, (%rsp)
- // retq
- populateThunk(MF, X86::R11);
- } else {
- // For 32-bit targets we need to emit a collection of thunks for various
- // possible scratch registers as well as a fallback that uses EDI, which is
- // normally callee saved.
- // __llvm_retpoline_eax:
- // calll .Leax_call_target
- // .Leax_capture_spec:
- // pause
- // jmp .Leax_capture_spec
- // .align 16
- // .Leax_call_target:
- // movl %eax, (%esp) # Clobber return addr
- // retl
- //
- // __llvm_retpoline_ecx:
- // ... # Same setup
- // movl %ecx, (%esp)
- // retl
- //
- // __llvm_retpoline_edx:
- // ... # Same setup
- // movl %edx, (%esp)
- // retl
- //
- // __llvm_retpoline_edi:
- // ... # Same setup
- // movl %edi, (%esp)
- // retl
- if (MF.getName() == EAXThunkName)
- populateThunk(MF, X86::EAX);
- else if (MF.getName() == ECXThunkName)
- populateThunk(MF, X86::ECX);
- else if (MF.getName() == EDXThunkName)
- populateThunk(MF, X86::EDX);
- else if (MF.getName() == EDIThunkName)
- populateThunk(MF, X86::EDI);
- else
- llvm_unreachable("Invalid thunk name on x86-32!");
- }
-
- return true;
-}
-
-void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) {
- assert(Name.startswith(ThunkNamePrefix) &&
- "Created a thunk with an unexpected prefix!");
-
- LLVMContext &Ctx = M.getContext();
- auto Type = FunctionType::get(Type::getVoidTy(Ctx), false);
- Function *F =
- Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M);
- F->setVisibility(GlobalValue::HiddenVisibility);
- F->setComdat(M.getOrInsertComdat(Name));
-
- // Add Attributes so that we don't create a frame, unwind information, or
- // inline.
- AttrBuilder B;
- B.addAttribute(llvm::Attribute::NoUnwind);
- B.addAttribute(llvm::Attribute::Naked);
- F->addAttributes(llvm::AttributeList::FunctionIndex, B);
-
- // Populate our function a bit so that we can verify.
- BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
- IRBuilder<> Builder(Entry);
-
- Builder.CreateRetVoid();
-
- // MachineFunctions/MachineBasicBlocks aren't created automatically for the
- // IR-level constructs we already made. Create them and insert them into the
- // module.
- MachineFunction &MF = MMI->getOrCreateMachineFunction(*F);
- MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry);
-
- // Insert EntryMBB into MF. It's not in the module until we do this.
- MF.insert(MF.end(), EntryMBB);
-}
-
-void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
- unsigned Reg) {
- const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr;
- const unsigned SPReg = Is64Bit ? X86::RSP : X86::ESP;
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(MovOpc)), SPReg, false, 0)
- .addReg(Reg);
-}
-
-void X86RetpolineThunks::populateThunk(MachineFunction &MF,
- unsigned Reg) {
- // Set MF properties. We never use vregs...
- MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
-
- // Grab the entry MBB and erase any other blocks. O0 codegen appears to
- // generate two bbs for the entry block.
- MachineBasicBlock *Entry = &MF.front();
- Entry->clear();
- while (MF.size() > 1)
- MF.erase(std::next(MF.begin()));
-
- MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
- MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
- MCSymbol *TargetSym = MF.getContext().createTempSymbol();
- MF.push_back(CaptureSpec);
- MF.push_back(CallTarget);
-
- const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
- const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
-
- Entry->addLiveIn(Reg);
- BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym);
-
- // The MIR verifier thinks that the CALL in the entry block will fall through
- // to CaptureSpec, so mark it as the successor. Technically, CaptureTarget is
- // the successor, but the MIR verifier doesn't know how to cope with that.
- Entry->addSuccessor(CaptureSpec);
-
- // In the capture loop for speculation, we want to stop the processor from
- // speculating as fast as possible. On Intel processors, the PAUSE instruction
- // will block speculation without consuming any execution resources. On AMD
- // processors, the PAUSE instruction is (essentially) a nop, so we also use an
- // LFENCE instruction which they have advised will stop speculation as well
- // with minimal resource utilization. We still end the capture with a jump to
- // form an infinite loop to fully guarantee that no matter what implementation
- // of the x86 ISA, speculating this code path never escapes.
- BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE));
- BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE));
- BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec);
- CaptureSpec->setHasAddressTaken();
- CaptureSpec->addSuccessor(CaptureSpec);
-
- CallTarget->addLiveIn(Reg);
- CallTarget->setHasAddressTaken();
- CallTarget->setAlignment(Align(16));
- insertRegReturnAddrClobber(*CallTarget, Reg);
- CallTarget->back().setPreInstrSymbol(MF, TargetSym);
- BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
-}
Property changes on: head/contrib/llvm-project/llvm/lib/Target/X86/X86RetpolineThunks.cpp
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
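For context on the deletion above: thunk emission moves to the generalized
X86IndirectThunks pass (see createX86IndirectThunksPass below). The sketch
that follows is illustrative only, not from the patch; it shows what the
retpoline thunks accomplish for an indirect call. The %rax register is just
an example choice.

// Illustrative only: how a C-level indirect call is lowered with retpolines.
void dispatch(void (*Callback)(void)) {
  Callback();
  // without retpoline: callq *%rax           (speculatable indirect branch)
  // with retpoline:    movq  %rax, %r11
  //                    callq __llvm_retpoline_r11
}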
Index: head/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h (nonexistent)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h (revision 362609)
@@ -0,0 +1,446 @@
+//==========-- ImmutableGraph.h - A fast DAG implementation ---------=========//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Description: ImmutableGraph is a fast DAG implementation that cannot be
+/// modified, except by creating a new ImmutableGraph. ImmutableGraph is
+/// implemented as two arrays: one containing nodes, and one containing edges.
+/// The advantages to this implementation are two-fold:
+/// 1. Iteration and traversal operations benefit from cache locality.
+/// 2. Operations on sets of nodes/edges are efficient, and representations of
+/// those sets in memory are compact. For instance, a set of edges is
+/// implemented as a bit vector, wherein each bit corresponds to one edge in
+/// the edge array. This implies a lower bound of 64x spatial improvement
+/// over, e.g., an llvm::DenseSet or llvm::SmallSet. It also means that
+/// insert/erase/contains operations complete in negligible constant time:
+/// insert and erase require one load and one store, and contains requires
+/// just one load.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H
+#define LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+template <typename NodeValueT, typename EdgeValueT> class ImmutableGraph {
+ using Traits = GraphTraits<ImmutableGraph<NodeValueT, EdgeValueT> *>;
+ template <typename> friend class ImmutableGraphBuilder;
+
+public:
+ using node_value_type = NodeValueT;
+ using edge_value_type = EdgeValueT;
+ using size_type = int;
+ class Node;
+ class Edge {
+ friend class ImmutableGraph;
+ template <typename> friend class ImmutableGraphBuilder;
+
+ const Node *Dest;
+ edge_value_type Value;
+
+ public:
+ const Node *getDest() const { return Dest; }
+ const edge_value_type &getValue() const { return Value; }
+ };
+ class Node {
+ friend class ImmutableGraph;
+ template <typename> friend class ImmutableGraphBuilder;
+
+ const Edge *Edges;
+ node_value_type Value;
+
+ public:
+ const node_value_type &getValue() const { return Value; }
+
+ const Edge *edges_begin() const { return Edges; }
+ // Nodes are allocated sequentially. Edges for a node are stored together.
+ // The end of this Node's edges is the beginning of the next node's edges.
+ // An extra node was allocated to hold the end pointer for the last real
+ // node.
+ const Edge *edges_end() const { return (this + 1)->Edges; }
+ ArrayRef<Edge> edges() const {
+ return makeArrayRef(edges_begin(), edges_end());
+ }
+ };
+
+protected:
+ ImmutableGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges,
+ size_type NodesSize, size_type EdgesSize)
+ : Nodes(std::move(Nodes)), Edges(std::move(Edges)), NodesSize(NodesSize),
+ EdgesSize(EdgesSize) {}
+ ImmutableGraph(const ImmutableGraph &) = delete;
+ ImmutableGraph(ImmutableGraph &&) = delete;
+ ImmutableGraph &operator=(const ImmutableGraph &) = delete;
+ ImmutableGraph &operator=(ImmutableGraph &&) = delete;
+
+public:
+ ArrayRef<Node> nodes() const { return makeArrayRef(Nodes.get(), NodesSize); }
+ const Node *nodes_begin() const { return nodes().begin(); }
+ const Node *nodes_end() const { return nodes().end(); }
+
+ ArrayRef<Edge> edges() const { return makeArrayRef(Edges.get(), EdgesSize); }
+ const Edge *edges_begin() const { return edges().begin(); }
+ const Edge *edges_end() const { return edges().end(); }
+
+ size_type nodes_size() const { return NodesSize; }
+ size_type edges_size() const { return EdgesSize; }
+
+ // Node N must belong to this ImmutableGraph.
+ size_type getNodeIndex(const Node &N) const {
+ return std::distance(nodes_begin(), &N);
+ }
+ // Edge E must belong to this ImmutableGraph.
+ size_type getEdgeIndex(const Edge &E) const {
+ return std::distance(edges_begin(), &E);
+ }
+
+ // FIXME: Could NodeSet and EdgeSet be templated to share code?
+ class NodeSet {
+ const ImmutableGraph &G;
+ BitVector V;
+
+ public:
+ NodeSet(const ImmutableGraph &G, bool ContainsAll = false)
+ : G{G}, V{static_cast<unsigned>(G.nodes_size()), ContainsAll} {}
+ bool insert(const Node &N) {
+ size_type Idx = G.getNodeIndex(N);
+ bool AlreadyExists = V.test(Idx);
+ V.set(Idx);
+ return !AlreadyExists;
+ }
+ void erase(const Node &N) {
+ size_type Idx = G.getNodeIndex(N);
+ V.reset(Idx);
+ }
+ bool contains(const Node &N) const {
+ size_type Idx = G.getNodeIndex(N);
+ return V.test(Idx);
+ }
+ void clear() { V.reset(); }
+ bool empty() const { return V.none(); }
+ /// Return the number of elements in the set
+ size_type count() const { return V.count(); }
+ /// Return the size of the set's domain
+ size_type size() const { return V.size(); }
+ /// Set union
+ NodeSet &operator|=(const NodeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V |= RHS.V;
+ return *this;
+ }
+ /// Set intersection
+ NodeSet &operator&=(const NodeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V &= RHS.V;
+ return *this;
+ }
+ /// Set disjoint union
+ NodeSet &operator^=(const NodeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V ^= RHS.V;
+ return *this;
+ }
+
+ using index_iterator = typename BitVector::const_set_bits_iterator;
+ index_iterator index_begin() const { return V.set_bits_begin(); }
+ index_iterator index_end() const { return V.set_bits_end(); }
+ void set(size_type Idx) { V.set(Idx); }
+ void reset(size_type Idx) { V.reset(Idx); }
+
+ class iterator {
+ const NodeSet &Set;
+ size_type Current;
+
+ void advance() {
+ assert(Current != -1);
+ Current = Set.V.find_next(Current);
+ }
+
+ public:
+ iterator(const NodeSet &Set, size_type Begin)
+ : Set{Set}, Current{Begin} {}
+ iterator operator++(int) {
+ iterator Tmp = *this;
+ advance();
+ return Tmp;
+ }
+ iterator &operator++() {
+ advance();
+ return *this;
+ }
+ const Node *operator*() const {
+ assert(Current != -1);
+ return Set.G.nodes_begin() + Current;
+ }
+ bool operator==(const iterator &other) const {
+ assert(&this->Set == &other.Set);
+ return this->Current == other.Current;
+ }
+ bool operator!=(const iterator &other) const { return !(*this == other); }
+ };
+
+ iterator begin() const { return iterator{*this, V.find_first()}; }
+ iterator end() const { return iterator{*this, -1}; }
+ };
+
+ class EdgeSet {
+ const ImmutableGraph &G;
+ BitVector V;
+
+ public:
+ EdgeSet(const ImmutableGraph &G, bool ContainsAll = false)
+ : G{G}, V{static_cast<unsigned>(G.edges_size()), ContainsAll} {}
+ bool insert(const Edge &E) {
+ size_type Idx = G.getEdgeIndex(E);
+ bool AlreadyExists = V.test(Idx);
+ V.set(Idx);
+ return !AlreadyExists;
+ }
+ void erase(const Edge &E) {
+ size_type Idx = G.getEdgeIndex(E);
+ V.reset(Idx);
+ }
+ bool contains(const Edge &E) const {
+ size_type Idx = G.getEdgeIndex(E);
+ return V.test(Idx);
+ }
+ void clear() { V.reset(); }
+ bool empty() const { return V.none(); }
+ /// Return the number of elements in the set
+ size_type count() const { return V.count(); }
+ /// Return the size of the set's domain
+ size_type size() const { return V.size(); }
+ /// Set union
+ EdgeSet &operator|=(const EdgeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V |= RHS.V;
+ return *this;
+ }
+ /// Set intersection
+ EdgeSet &operator&=(const EdgeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V &= RHS.V;
+ return *this;
+ }
+ /// Set disjoint union
+ EdgeSet &operator^=(const EdgeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V ^= RHS.V;
+ return *this;
+ }
+
+ using index_iterator = typename BitVector::const_set_bits_iterator;
+ index_iterator index_begin() const { return V.set_bits_begin(); }
+ index_iterator index_end() const { return V.set_bits_end(); }
+ void set(size_type Idx) { V.set(Idx); }
+ void reset(size_type Idx) { V.reset(Idx); }
+
+ class iterator {
+ const EdgeSet &Set;
+ size_type Current;
+
+ void advance() {
+ assert(Current != -1);
+ Current = Set.V.find_next(Current);
+ }
+
+ public:
+ iterator(const EdgeSet &Set, size_type Begin)
+ : Set{Set}, Current{Begin} {}
+ iterator operator++(int) {
+ iterator Tmp = *this;
+ advance();
+ return Tmp;
+ }
+ iterator &operator++() {
+ advance();
+ return *this;
+ }
+ const Edge *operator*() const {
+ assert(Current != -1);
+ return Set.G.edges_begin() + Current;
+ }
+ bool operator==(const iterator &other) const {
+ assert(&this->Set == &other.Set);
+ return this->Current == other.Current;
+ }
+ bool operator!=(const iterator &other) const { return !(*this == other); }
+ };
+
+ iterator begin() const { return iterator{*this, V.find_first()}; }
+ iterator end() const { return iterator{*this, -1}; }
+ };
+
+private:
+ std::unique_ptr<Node[]> Nodes;
+ std::unique_ptr<Edge[]> Edges;
+ size_type NodesSize;
+ size_type EdgesSize;
+};
+
+template <typename GraphT> class ImmutableGraphBuilder {
+ using node_value_type = typename GraphT::node_value_type;
+ using edge_value_type = typename GraphT::edge_value_type;
+ static_assert(
+ std::is_base_of<ImmutableGraph<node_value_type, edge_value_type>,
+ GraphT>::value,
+ "Template argument to ImmutableGraphBuilder must derive from "
+ "ImmutableGraph<>");
+ using size_type = typename GraphT::size_type;
+ using NodeSet = typename GraphT::NodeSet;
+ using Node = typename GraphT::Node;
+ using EdgeSet = typename GraphT::EdgeSet;
+ using Edge = typename GraphT::Edge;
+ using BuilderEdge = std::pair<edge_value_type, size_type>;
+ using EdgeList = std::vector<BuilderEdge>;
+ using BuilderVertex = std::pair<node_value_type, EdgeList>;
+ using VertexVec = std::vector<BuilderVertex>;
+
+public:
+ using BuilderNodeRef = size_type;
+
+ BuilderNodeRef addVertex(const node_value_type &V) {
+ auto I = AdjList.emplace(AdjList.end(), V, EdgeList{});
+ return std::distance(AdjList.begin(), I);
+ }
+
+ void addEdge(const edge_value_type &E, BuilderNodeRef From,
+ BuilderNodeRef To) {
+ AdjList[From].second.emplace_back(E, To);
+ }
+
+ bool empty() const { return AdjList.empty(); }
+
+ template <typename... ArgT> std::unique_ptr<GraphT> get(ArgT &&... Args) {
+ size_type VertexSize = AdjList.size(), EdgeSize = 0;
+ for (const auto &V : AdjList) {
+ EdgeSize += V.second.size();
+ }
+ auto VertexArray =
+ std::make_unique<Node[]>(VertexSize + 1 /* terminator node */);
+ auto EdgeArray = std::make_unique<Edge[]>(EdgeSize);
+ size_type VI = 0, EI = 0;
+ for (; VI < VertexSize; ++VI) {
+ VertexArray[VI].Value = std::move(AdjList[VI].first);
+ VertexArray[VI].Edges = &EdgeArray[EI];
+ auto NumEdges = static_cast<size_type>(AdjList[VI].second.size());
+ for (size_type VEI = 0; VEI < NumEdges; ++VEI, ++EI) {
+ auto &E = AdjList[VI].second[VEI];
+ EdgeArray[EI].Value = std::move(E.first);
+ EdgeArray[EI].Dest = &VertexArray[E.second];
+ }
+ }
+ assert(VI == VertexSize && EI == EdgeSize && "ImmutableGraph malformed");
+ VertexArray[VI].Edges = &EdgeArray[EdgeSize]; // terminator node
+ return std::make_unique<GraphT>(std::move(VertexArray),
+ std::move(EdgeArray), VertexSize, EdgeSize,
+ std::forward<ArgT>(Args)...);
+ }
+
+ template <typename... ArgT>
+ static std::unique_ptr<GraphT> trim(const GraphT &G, const NodeSet &TrimNodes,
+ const EdgeSet &TrimEdges,
+ ArgT &&... Args) {
+ size_type NewVertexSize = G.nodes_size() - TrimNodes.count();
+ size_type NewEdgeSize = G.edges_size() - TrimEdges.count();
+ auto NewVertexArray =
+ std::make_unique<Node[]>(NewVertexSize + 1 /* terminator node */);
+ auto NewEdgeArray = std::make_unique<Edge[]>(NewEdgeSize);
+
+ // Walk the nodes and determine the new index for each node.
+ size_type NewNodeIndex = 0;
+ std::vector<size_type> RemappedNodeIndex(G.nodes_size());
+ for (const Node &N : G.nodes()) {
+ if (TrimNodes.contains(N))
+ continue;
+ RemappedNodeIndex[G.getNodeIndex(N)] = NewNodeIndex++;
+ }
+ assert(NewNodeIndex == NewVertexSize &&
+ "Should have assigned NewVertexSize indices");
+
+ size_type VertexI = 0, EdgeI = 0;
+ for (const Node &N : G.nodes()) {
+ if (TrimNodes.contains(N))
+ continue;
+ NewVertexArray[VertexI].Value = N.getValue();
+ NewVertexArray[VertexI].Edges = &NewEdgeArray[EdgeI];
+ for (const Edge &E : N.edges()) {
+ if (TrimEdges.contains(E))
+ continue;
+ NewEdgeArray[EdgeI].Value = E.getValue();
+ size_type DestIdx = G.getNodeIndex(*E.getDest());
+ size_type NewIdx = RemappedNodeIndex[DestIdx];
+ assert(NewIdx < NewVertexSize);
+ NewEdgeArray[EdgeI].Dest = &NewVertexArray[NewIdx];
+ ++EdgeI;
+ }
+ ++VertexI;
+ }
+ assert(VertexI == NewVertexSize && EdgeI == NewEdgeSize &&
+ "Gadget graph malformed");
+ NewVertexArray[VertexI].Edges = &NewEdgeArray[NewEdgeSize]; // terminator
+ return std::make_unique<GraphT>(std::move(NewVertexArray),
+ std::move(NewEdgeArray), NewVertexSize,
+ NewEdgeSize, std::forward<ArgT>(Args)...);
+ }
+
+private:
+ VertexVec AdjList;
+};
+
+template <typename NodeValueT, typename EdgeValueT>
+struct GraphTraits<ImmutableGraph<NodeValueT, EdgeValueT> *> {
+ using GraphT = ImmutableGraph<NodeValueT, EdgeValueT>;
+ using NodeRef = typename GraphT::Node const *;
+ using EdgeRef = typename GraphT::Edge const &;
+
+ static NodeRef edge_dest(EdgeRef E) { return E.getDest(); }
+ using ChildIteratorType =
+ mapped_iterator<typename GraphT::Edge const *, decltype(&edge_dest)>;
+
+ static NodeRef getEntryNode(GraphT *G) { return G->nodes_begin(); }
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->edges_begin(), &edge_dest};
+ }
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->edges_end(), &edge_dest};
+ }
+
+ static NodeRef getNode(typename GraphT::Node const &N) { return NodeRef{&N}; }
+ using nodes_iterator =
+ mapped_iterator<typename GraphT::Node const *, decltype(&getNode)>;
+ static nodes_iterator nodes_begin(GraphT *G) {
+ return {G->nodes_begin(), &getNode};
+ }
+ static nodes_iterator nodes_end(GraphT *G) {
+ return {G->nodes_end(), &getNode};
+ }
+
+ using ChildEdgeIteratorType = typename GraphT::Edge const *;
+
+ static ChildEdgeIteratorType child_edge_begin(NodeRef N) {
+ return N->edges_begin();
+ }
+ static ChildEdgeIteratorType child_edge_end(NodeRef N) {
+ return N->edges_end();
+ }
+ static typename GraphT::size_type size(GraphT *G) { return G->nodes_size(); }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H
Property changes on: head/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
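A minimal usage sketch for the header added above (not part of the patch;
MyGraph and its int payloads are hypothetical). Any type deriving from
ImmutableGraph<NodeValueT, EdgeValueT> that re-exposes the protected
constructor works with ImmutableGraphBuilder.

#include "ImmutableGraph.h"
#include <cstdio>
#include <memory>

namespace {
struct MyGraph : llvm::ImmutableGraph<int, int> {
  // Re-expose the protected base constructor so the builder's get() can
  // construct the graph via std::make_unique.
  MyGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges,
          size_type NodesSize, size_type EdgesSize)
      : ImmutableGraph(std::move(Nodes), std::move(Edges), NodesSize,
                       EdgesSize) {}
};
} // namespace

int main() {
  llvm::ImmutableGraphBuilder<MyGraph> Builder;
  auto A = Builder.addVertex(1); // BuilderNodeRef is just an index
  auto B = Builder.addVertex(2);
  Builder.addEdge(/*Value=*/42, A, B);
  std::unique_ptr<MyGraph> G = Builder.get();

  // NodeSet/EdgeSet are bit vectors indexed by array position, so membership
  // tests and updates are constant-time as the file comment describes.
  MyGraph::NodeSet Visited(*G);
  for (const MyGraph::Node &N : G->nodes()) {
    Visited.insert(N);
    for (const MyGraph::Edge &E : N.edges())
      std::printf("%d -> %d (edge value %d)\n", N.getValue(),
                  E.getDest()->getValue(), E.getValue());
  }
  return Visited.count() == G->nodes_size() ? 0 : 1;
}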
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86.h (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86.h (revision 362609)
@@ -1,167 +1,173 @@
//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the entry points for global functions defined in the x86
// target library, as used by the LLVM JIT.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_X86_X86_H
#define LLVM_LIB_TARGET_X86_X86_H
#include "llvm/Support/CodeGen.h"
namespace llvm {
class FunctionPass;
class ImmutablePass;
class InstructionSelector;
class ModulePass;
class PassRegistry;
class X86RegisterBankInfo;
class X86Subtarget;
class X86TargetMachine;
/// This pass converts a legalized DAG into a X86-specific DAG, ready for
/// instruction scheduling.
FunctionPass *createX86ISelDag(X86TargetMachine &TM,
CodeGenOpt::Level OptLevel);
/// This pass initializes a global base register for PIC on x86-32.
FunctionPass *createX86GlobalBaseRegPass();
/// This pass combines multiple accesses to local-dynamic TLS variables so that
/// the TLS base address for the module is only fetched once per execution path
/// through the function.
FunctionPass *createCleanupLocalDynamicTLSPass();
/// This function returns a pass which converts floating-point register
/// references and pseudo instructions into floating-point stack references and
/// physical instructions.
FunctionPass *createX86FloatingPointStackifierPass();
/// This pass inserts AVX vzeroupper instructions before each call to avoid
/// transition penalty between functions encoded with AVX and SSE.
FunctionPass *createX86IssueVZeroUpperPass();
/// This pass inserts ENDBR instructions before indirect jump/call
/// destinations as part of the CET IBT mechanism.
FunctionPass *createX86IndirectBranchTrackingPass();
/// Return a pass that pads short functions with NOOPs.
/// This will prevent a stall when returning on the Atom.
FunctionPass *createX86PadShortFunctions();
/// Return a pass that selectively replaces certain instructions (like add,
/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA
/// instructions, in order to eliminate execution delays in some processors.
FunctionPass *createX86FixupLEAs();
/// Return a pass that removes redundant LEA instructions and redundant address
/// recalculations.
FunctionPass *createX86OptimizeLEAs();
/// Return a pass that transforms setcc + movzx pairs into xor + setcc.
FunctionPass *createX86FixupSetCC();
/// Return a pass that folds conditional branch jumps.
FunctionPass *createX86CondBrFolding();
/// Return a pass that avoids creating store-forwarding block issues in the hardware.
FunctionPass *createX86AvoidStoreForwardingBlocks();
/// Return a pass that lowers EFLAGS copy pseudo instructions.
FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
/// Return a pass that inserts int3 at the end of the function if it ends with a
/// CALL instruction. The pass does the same for each funclet as well. This
/// ensures that the open interval of function start and end PCs contains all
/// return addresses for the benefit of the Windows x64 unwinder.
FunctionPass *createX86AvoidTrailingCallPass();
/// Return a pass that optimizes the code-size of x86 call sequences. This is
/// done by replacing esp-relative movs with pushes.
FunctionPass *createX86CallFrameOptimization();
/// Return an IR pass that inserts EH registration stack objects and explicit
/// EH state updates. This pass must run after EH preparation, which does
/// Windows-specific but architecture-neutral preparation.
FunctionPass *createX86WinEHStatePass();
/// Return a Machine IR pass that expands X86-specific pseudo
/// instructions into a sequence of actual instructions. This pass
/// must run after prologue/epilogue insertion and before lowering
/// the MachineInstr to MC.
FunctionPass *createX86ExpandPseudoPass();
/// This pass converts X86 cmov instructions into branch when profitable.
FunctionPass *createX86CmovConverterPass();
/// Return a Machine IR pass that selectively replaces
/// certain byte and word instructions by equivalent 32 bit instructions,
/// in order to eliminate partial register usage, false dependences on
/// the upper portions of registers, and to save code size.
FunctionPass *createX86FixupBWInsts();
/// Return a Machine IR pass that reassigns instruction chains from one domain
/// to another, when profitable.
FunctionPass *createX86DomainReassignmentPass();
/// This pass replaces EVEX-encoded AVX-512 instructions with VEX encoding
/// when possible, in order to reduce code size.
FunctionPass *createX86EvexToVexInsts();
/// This pass creates the thunks for the retpoline feature.
-FunctionPass *createX86RetpolineThunksPass();
+FunctionPass *createX86IndirectThunksPass();
/// This pass ensures instructions featuring a memory operand
/// have distinctive <LineNumber, Discriminator> (with respect to each other)
FunctionPass *createX86DiscriminateMemOpsPass();
/// This pass applies profiling information to insert cache prefetches.
FunctionPass *createX86InsertPrefetchPass();
InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &,
X86RegisterBankInfo &);
+FunctionPass *createX86LoadValueInjectionLoadHardeningPass();
+FunctionPass *createX86LoadValueInjectionLoadHardeningUnoptimizedPass();
+FunctionPass *createX86LoadValueInjectionRetHardeningPass();
FunctionPass *createX86SpeculativeLoadHardeningPass();
void initializeEvexToVexInstPassPass(PassRegistry &);
void initializeFixupBWInstPassPass(PassRegistry &);
void initializeFixupLEAPassPass(PassRegistry &);
void initializeFPSPass(PassRegistry &);
void initializeWinEHStatePassPass(PassRegistry &);
void initializeX86AvoidSFBPassPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
void initializeX86CondBrFoldingPassPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
void initializeX86ExecutionDomainFixPass(PassRegistry &);
void initializeX86ExpandPseudoPass(PassRegistry &);
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
+void initializeX86LoadValueInjectionLoadHardeningUnoptimizedPassPass(PassRegistry &);
+void initializeX86LoadValueInjectionLoadHardeningPassPass(PassRegistry &);
+void initializeX86LoadValueInjectionRetHardeningPassPass(PassRegistry &);
void initializeX86OptimizeLEAPassPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
namespace X86AS {
enum : unsigned {
GS = 256,
FS = 257,
SS = 258,
PTR32_SPTR = 270,
PTR32_UPTR = 271,
PTR64 = 272
};
} // End X86AS namespace
} // End llvm namespace
#endif
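A hedged sketch of how these factory and initializer hooks are conventionally
consumed. The real wiring lives in X86TargetMachine.cpp, which is not part of
this excerpt, so the pass-manager use below is only illustrative.

#include "X86.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/PassRegistry.h"

using namespace llvm;

static void addLVIHardeningPasses(legacy::PassManager &PM) {
  // Register pass metadata (name, dependency info) once with the registry.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
  initializeX86LoadValueInjectionRetHardeningPassPass(PR);
  // The create* factories hand ownership of a new pass to the manager.
  PM.add(createX86LoadValueInjectionLoadHardeningPass());
  PM.add(createX86LoadValueInjectionRetHardeningPass());
}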
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86.td
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86.td (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86.td (revision 362609)
@@ -1,1312 +1,1328 @@
//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a target description file for the Intel i386 architecture, referred
// to here as the "X86" architecture.
//
//===----------------------------------------------------------------------===//
// Get the target-independent interfaces which we are implementing...
//
include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// X86 Subtarget state
//
def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
"64-bit mode (x86_64)">;
def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
"32-bit mode (80386)">;
def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
"16-bit mode (i8086)">;
//===----------------------------------------------------------------------===//
// X86 Subtarget features
//===----------------------------------------------------------------------===//
def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
"Enable X87 float instructions">;
def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true",
"Enable NOPL instruction">;
def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
"Enable conditional move instructions">;
def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true",
"Support CMPXCHG8B instructions">;
def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
"Support POPCNT instruction">;
def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true",
"Support fxsave/fxrestore instructions">;
def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
"Support xsave instructions">;
def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
"Support xsaveopt instructions">;
def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
"Support xsavec instructions">;
def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
"Support xsaves instructions">;
def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
"Enable SSE instructions">;
def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
"Enable SSE2 instructions",
[FeatureSSE1]>;
def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
"Enable SSE3 instructions",
[FeatureSSE2]>;
def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
"Enable SSSE3 instructions",
[FeatureSSE3]>;
def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
"Enable SSE 4.1 instructions",
[FeatureSSSE3]>;
def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
"Enable SSE 4.2 instructions",
[FeatureSSE41]>;
// The MMX subtarget feature is separate from the rest of the SSE features
// because it's important (for odd compatibility reasons) to be able to
// turn it off explicitly while allowing SSE+ to be on.
def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX",
"Enable MMX instructions">;
def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
"Enable 3DNow! instructions",
[FeatureMMX]>;
def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
"Enable 3DNow! Athlon instructions",
[Feature3DNow]>;
// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
// without disabling 64-bit mode. Nothing should imply this feature bit. It
// is used to enforce that only 64-bit capable CPUs are used in 64-bit mode.
def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
"Support 64-bit instructions">;
def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
"64-bit with cmpxchg16b",
[FeatureCMPXCHG8B]>;
def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
"SHLD instruction is slow">;
def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
"PMULLD instruction is slow">;
def FeatureSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
"true",
"PMADDWD is slower than PMULLD">;
// FIXME: This should not apply to CPUs that do not have SSE.
def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
"IsUAMem16Slow", "true",
"Slow unaligned 16-byte memory access">;
def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
"IsUAMem32Slow", "true",
"Slow unaligned 32-byte memory access">;
def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
"Support SSE 4a instructions",
[FeatureSSE3]>;
def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
"Enable AVX instructions",
[FeatureSSE42]>;
def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
"Enable three-operand fused multiple-add",
[FeatureAVX]>;
def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
"Support 16-bit floating point conversion instructions",
[FeatureAVX]>;
def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
"Enable AVX-512 instructions",
[FeatureAVX2, FeatureFMA, FeatureF16C]>;
def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
"Enable AVX-512 Exponential and Reciprocal Instructions",
[FeatureAVX512]>;
def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
"Enable AVX-512 Conflict Detection Instructions",
[FeatureAVX512]>;
def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
"true", "Enable AVX-512 Population Count Instructions",
[FeatureAVX512]>;
def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
"Enable AVX-512 PreFetch Instructions",
[FeatureAVX512]>;
def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1",
"true",
"Prefetch with Intent to Write and T1 Hint">;
def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
"Enable AVX-512 Doubleword and Quadword Instructions",
[FeatureAVX512]>;
def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true",
"Enable AVX-512 Byte and Word Instructions",
[FeatureAVX512]>;
def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
"Enable AVX-512 Vector Length eXtensions",
[FeatureAVX512]>;
def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
"Enable AVX-512 Vector Byte Manipulation Instructions",
[FeatureBWI]>;
def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true",
"Enable AVX-512 further Vector Byte Manipulation Instructions",
[FeatureBWI]>;
def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
"Enable AVX-512 Integer Fused Multiple-Add",
[FeatureAVX512]>;
def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
"Enable protection keys">;
def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
"Enable AVX-512 Vector Neural Network Instructions",
[FeatureAVX512]>;
def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
"Support bfloat16 floating point",
[FeatureBWI]>;
def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true",
"Enable AVX-512 Bit Algorithms",
[FeatureBWI]>;
def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect",
"HasVP2INTERSECT", "true",
"Enable AVX-512 vp2intersect",
[FeatureAVX512]>;
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
def FeatureGFNI : SubtargetFeature<"gfni", "HasGFNI", "true",
"Enable Galois Field Arithmetic Instructions",
[FeatureSSE2]>;
def FeatureVPCLMULQDQ : SubtargetFeature<"vpclmulqdq", "HasVPCLMULQDQ", "true",
"Enable vpclmulqdq instructions",
[FeatureAVX, FeaturePCLMUL]>;
def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
"Enable four-operand fused multiple-add",
[FeatureAVX, FeatureSSE4A]>;
def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
"Enable XOP instructions",
[FeatureFMA4]>;
def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
"HasSSEUnalignedMem", "true",
"Allow unaligned memory operands with SSE instructions">;
def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
"Enable AES instructions",
[FeatureSSE2]>;
def FeatureVAES : SubtargetFeature<"vaes", "HasVAES", "true",
"Promote selected AES instructions to AVX512/AVX registers",
[FeatureAVX, FeatureAES]>;
def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true",
"Enable TBM instructions">;
def FeatureLWP : SubtargetFeature<"lwp", "HasLWP", "true",
"Enable LWP instructions">;
def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
"Support MOVBE instruction">;
def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
"Support RDRAND instruction">;
def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
"Support FS/GS Base instructions">;
def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
"Support LZCNT instruction">;
def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
"Support BMI instructions">;
def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
"Support BMI2 instructions">;
def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
"Support RTM instructions">;
def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
"Support ADX instructions">;
def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
"Enable SHA instructions",
[FeatureSSE2]>;
def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true",
"Support CET Shadow-Stack instructions">;
def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
"Support PRFCHW instructions">;
def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
"Support RDSEED instruction">;
def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
"Support LAHF and SAHF instructions">;
def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
"Enable MONITORX/MWAITX timer functionality">;
def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
"Enable Cache Line Zero">;
def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true",
"Enable Cache Demote">;
def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
"Support ptwrite instruction">;
// FIXME: This feature is deprecated in 10.0 and should not be used for
// anything, but removing it would break IR files that may contain it in a
// target-feature attribute.
def FeatureDeprecatedMPX : SubtargetFeature<"mpx", "DeprecatedHasMPX", "false",
"Deprecated. Support MPX instructions">;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
"HasSlowDivide32", "true",
"Use 8-bit divide for positive values less than 256">;
def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
"HasSlowDivide64", "true",
"Use 32-bit divide for positive values less than 2^32">;
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
def FeatureINVPCID : SubtargetFeature<"invpcid", "HasINVPCID", "true",
"Invalidate Process-Context Identifier">;
def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
"Enable Software Guard Extensions">;
def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
"Flush A Cache Line Optimized">;
def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
"Cache Line Write Back">;
def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true",
"Write Back No Invalidate">;
def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
"Support RDPID instructions">;
def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
"Wait and pause enhancements">;
def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
"Has ENQCMD instructions">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
"SlowTwoMemOps", "true",
"Two memory operand instructions are slow">;
def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
"LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features">;
def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
"HasPOPCNTFalseDeps", "true",
"POPCNT has a false dependency on dest register">;
def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
"HasLZCNTFalseDeps", "true",
"LZCNT/TZCNT have a false dependency on dest register">;
def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true",
"platform configuration instruction">;
// On recent X86 (port-bound) processors, it's preferable to combine into a
// single shuffle using a variable mask rather than multiple fixed shuffles.
def FeatureFastVariableShuffle
: SubtargetFeature<"fast-variable-shuffle",
"HasFastVariableShuffle",
"true", "Shuffles with variable masks are fast">;
// On some X86 processors, a vzeroupper instruction should be inserted after
// using ymm/zmm registers before executing code that may use SSE instructions.
def FeatureInsertVZEROUPPER
: SubtargetFeature<"vzeroupper",
"InsertVZEROUPPER",
"true", "Should insert vzeroupper instructions">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
// The idea is that throughput bound code is likely to be vectorized, so for
// vectorized code we should care about the throughput of SQRT operations.
// But if the code is scalar that probably means that the code has some kind of
// dependency and we should care more about reducing the latency.
def FeatureFastScalarFSQRT
: SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
"true", "Scalar SQRT is fast (disable Newton-Raphson)">;
def FeatureFastVectorFSQRT
: SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
"true", "Vector SQRT is fast (disable Newton-Raphson)">;
// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
// be used to replace test/set sequences.
def FeatureFastLZCNT
: SubtargetFeature<
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
// The target can efficiently decode NOPs up to 11 bytes in length.
def FeatureFast11ByteNOP
: SubtargetFeature<
"fast-11bytenop", "HasFast11ByteNOP", "true",
"Target can quickly decode up to 11 byte NOPs">;
// The target can efficiently decode NOPs up to 15 bytes in length.
def FeatureFast15ByteNOP
: SubtargetFeature<
"fast-15bytenop", "HasFast15ByteNOP", "true",
"Target can quickly decode up to 15 byte NOPs">;
// Sandy Bridge and newer processors can use SHLD with the same source on both
// inputs to implement rotate to avoid the partial flag update of the normal
// rotate instructions.
def FeatureFastSHLDRotate
: SubtargetFeature<
"fast-shld-rotate", "HasFastSHLDRotate", "true",
"SHLD can be used as a faster rotate">;
// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
// "string operations"). See "REP String Enhancement" in the Intel Software
// Development Manual. This feature essentially means that REP MOVSB will copy
// using the largest available size instead of copying bytes one by one, making
// it at least as fast as REP MOVS{W,D,Q}.
def FeatureERMSB
: SubtargetFeature<
"ermsb", "HasERMSB", "true",
"REP MOVS/STOS are fast">;
// Bulldozer and newer processors can merge CMP/TEST (but not other
// instructions) with conditional branches.
def FeatureBranchFusion
: SubtargetFeature<"branchfusion", "HasBranchFusion", "true",
"CMP/TEST can be fused with conditional branches">;
// Sandy Bridge and newer processors have many instructions that can be
// fused with conditional branches and pass through the CPU as a single
// operation.
def FeatureMacroFusion
: SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
"Various instructions can be fused with conditional branches">;
// Gather is available since Haswell (the AVX2 set), so technically we can
// generate gathers on all AVX2 processors, but the overhead on HSW is high.
// The Skylake client processor has faster gathers than HSW, with performance
// similar to Skylake Server (AVX-512).
def FeatureHasFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast">;
def FeaturePrefer128Bit
: SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true",
"Prefer 128-bit AVX instructions">;
def FeaturePrefer256Bit
: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
"Prefer 256-bit AVX instructions">;
def FeaturePreferMaskRegisters
: SubtargetFeature<"prefer-mask-registers", "PreferMaskRegisters", "true",
"Prefer AVX512 mask registers over PTEST/MOVMSK">;
// Lower indirect calls using a special construct called a `retpoline` to
// mitigate potential Spectre v2 attacks against them.
def FeatureRetpolineIndirectCalls
: SubtargetFeature<
"retpoline-indirect-calls", "UseRetpolineIndirectCalls", "true",
"Remove speculation of indirect calls from the generated code">;
// Lower indirect branches and switches either using conditional branch trees
// or using a special construct called a `retpoline` to mitigate potential
// Spectre v2 attacks against them.
def FeatureRetpolineIndirectBranches
: SubtargetFeature<
"retpoline-indirect-branches", "UseRetpolineIndirectBranches", "true",
"Remove speculation of indirect branches from the generated code">;
// Deprecated umbrella feature for enabling both `retpoline-indirect-calls` and
// `retpoline-indirect-branches` above.
def FeatureRetpoline
: SubtargetFeature<"retpoline", "DeprecatedUseRetpoline", "true",
"Remove speculation of indirect branches from the "
"generated code, either by avoiding them entirely or "
"lowering them with a speculation blocking construct",
[FeatureRetpolineIndirectCalls,
FeatureRetpolineIndirectBranches]>;
// Rely on external thunks for the emitted retpoline calls. This allows users
// to provide their own custom thunk definitions in highly specialized
// environments such as a kernel that does boot-time hot patching.
def FeatureRetpolineExternalThunk
: SubtargetFeature<
"retpoline-external-thunk", "UseRetpolineExternalThunk", "true",
"When lowering an indirect call or branch using a `retpoline`, rely "
"on the specified user provided thunk rather than emitting one "
"ourselves. Only has effect when combined with some other retpoline "
"feature", [FeatureRetpolineIndirectCalls]>;
+// Mitigate LVI attacks against indirect calls/branches and call returns
+def FeatureLVIControlFlowIntegrity
+ : SubtargetFeature<
+ "lvi-cfi", "UseLVIControlFlowIntegrity", "true",
+ "Prevent indirect calls/branches from using a memory operand, and "
+ "precede all indirect calls/branches from a register with an "
+ "LFENCE instruction to serialize control flow. Also decompose RET "
+ "instructions into a POP+LFENCE+JMP sequence.">;
+
+// Mitigate LVI attacks against data loads
+def FeatureLVILoadHardening
+ : SubtargetFeature<
+ "lvi-load-hardening", "UseLVILoadHardening", "true",
+ "Insert LFENCE instructions to prevent data speculatively injected "
+ "into loads from being used maliciously.">;
+
// Direct Move instructions.
def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
"Support movdiri instruction">;
def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
"Support movdir64b instruction">;
def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
"Indicates that the BEXTR instruction is implemented as a single uop "
"with good throughput">;
// Combine vector math operations with shuffles into horizontal math
// instructions if a CPU implements horizontal operations (introduced with
// SSE3) with better latency/throughput than the alternative sequence.
def FeatureFastHorizontalOps
: SubtargetFeature<
"fast-hops", "HasFastHorizontalOps", "true",
"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
"normal vector instructions with shuffles">;
def FeatureFastScalarShiftMasks
: SubtargetFeature<
"fast-scalar-shift-masks", "HasFastScalarShiftMasks", "true",
"Prefer a left/right scalar logical shift pair over a shift+and pair">;
def FeatureFastVectorShiftMasks
: SubtargetFeature<
"fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
"Prefer a left/right vector logical shift pair over a shift+and pair">;
def FeatureUseGLMDivSqrtCosts
: SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
"Use Goldmont specific floating point div/sqrt costs">;
// Merge branches using three-way conditional code.
def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
"ThreewayBranchProfitable", "true",
"Merge branches to a three-way "
"conditional branch">;
// Enable use of alias analysis during code generation.
def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
"Use alias analysis during codegen">;
// Bonnell
def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
// Silvermont
def ProcIntelSLM : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">;
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
include "X86RegisterInfo.td"
include "X86RegisterBanks.td"
//===----------------------------------------------------------------------===//
// Instruction Descriptions
//===----------------------------------------------------------------------===//
include "X86Schedule.td"
include "X86InstrInfo.td"
include "X86SchedPredicates.td"
def X86InstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
// X86 Scheduler Models
//===----------------------------------------------------------------------===//
include "X86ScheduleAtom.td"
include "X86SchedSandyBridge.td"
include "X86SchedHaswell.td"
include "X86SchedBroadwell.td"
include "X86ScheduleSLM.td"
include "X86ScheduleZnver1.td"
include "X86ScheduleZnver2.td"
include "X86ScheduleBdVer2.td"
include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
include "X86SchedSkylakeServer.td"
//===----------------------------------------------------------------------===//
// X86 Processor Feature Lists
//===----------------------------------------------------------------------===//
def ProcessorFeatures {
// Nehalem
list<SubtargetFeature> NHMInheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureLAHFSAHF,
FeatureMacroFusion,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> NHMSpecificFeatures = [];
list<SubtargetFeature> NHMFeatures =
!listconcat(NHMInheritableFeatures, NHMSpecificFeatures);
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
list<SubtargetFeature> WSMSpecificFeatures = [];
list<SubtargetFeature> WSMInheritableFeatures =
!listconcat(NHMInheritableFeatures, WSMAdditionalFeatures);
list<SubtargetFeature> WSMFeatures =
!listconcat(WSMInheritableFeatures, WSMSpecificFeatures);
// Sandybridge
list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX,
FeatureSlowDivide64,
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate,
FeatureMergeToThreeWayBranch];
list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> SNBInheritableFeatures =
!listconcat(WSMInheritableFeatures, SNBAdditionalFeatures);
list<SubtargetFeature> SNBFeatures =
!listconcat(SNBInheritableFeatures, SNBSpecificFeatures);
// Ivybridge
list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND,
FeatureF16C,
FeatureFSGSBase];
list<SubtargetFeature> IVBSpecificFeatures = [FeatureSlowUAMem32,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> IVBInheritableFeatures =
!listconcat(SNBInheritableFeatures, IVBAdditionalFeatures);
list<SubtargetFeature> IVBFeatures =
!listconcat(IVBInheritableFeatures, IVBSpecificFeatures);
// Haswell
list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2,
FeatureBMI,
FeatureBMI2,
FeatureERMSB,
FeatureFMA,
FeatureINVPCID,
FeatureLZCNT,
FeatureMOVBE,
FeatureFastVariableShuffle];
list<SubtargetFeature> HSWSpecificFeatures = [FeaturePOPCNTFalseDeps,
FeatureLZCNTFalseDeps];
list<SubtargetFeature> HSWInheritableFeatures =
!listconcat(IVBInheritableFeatures, HSWAdditionalFeatures);
list<SubtargetFeature> HSWFeatures =
!listconcat(HSWInheritableFeatures, HSWSpecificFeatures);
// Broadwell
list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX,
FeatureRDSEED,
FeaturePRFCHW];
list<SubtargetFeature> BDWSpecificFeatures = [FeaturePOPCNTFalseDeps,
FeatureLZCNTFalseDeps];
list<SubtargetFeature> BDWInheritableFeatures =
!listconcat(HSWInheritableFeatures, BDWAdditionalFeatures);
list<SubtargetFeature> BDWFeatures =
!listconcat(BDWInheritableFeatures, BDWSpecificFeatures);
// Skylake
list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
FeatureXSAVEC,
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureFastVectorFSQRT];
list<SubtargetFeature> SKLSpecificFeatures = [FeatureHasFastGather,
FeaturePOPCNTFalseDeps,
FeatureSGX];
list<SubtargetFeature> SKLInheritableFeatures =
!listconcat(BDWInheritableFeatures, SKLAdditionalFeatures);
list<SubtargetFeature> SKLFeatures =
!listconcat(SKLInheritableFeatures, SKLSpecificFeatures);
// Skylake-AVX512
list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512,
FeaturePrefer256Bit,
FeatureCDI,
FeatureDQI,
FeatureBWI,
FeatureVLX,
FeaturePKU,
FeatureCLWB];
list<SubtargetFeature> SKXSpecificFeatures = [FeatureHasFastGather,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> SKXInheritableFeatures =
!listconcat(SKLInheritableFeatures, SKXAdditionalFeatures);
list<SubtargetFeature> SKXFeatures =
!listconcat(SKXInheritableFeatures, SKXSpecificFeatures);
// Cascadelake
list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI];
list<SubtargetFeature> CLXSpecificFeatures = [FeatureHasFastGather,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> CLXInheritableFeatures =
!listconcat(SKXInheritableFeatures, CLXAdditionalFeatures);
list<SubtargetFeature> CLXFeatures =
!listconcat(CLXInheritableFeatures, CLXSpecificFeatures);
// Cooperlake
list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16];
list<SubtargetFeature> CPXSpecificFeatures = [FeatureHasFastGather,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> CPXInheritableFeatures =
!listconcat(CLXInheritableFeatures, CPXAdditionalFeatures);
list<SubtargetFeature> CPXFeatures =
!listconcat(CPXInheritableFeatures, CPXSpecificFeatures);
// Cannonlake
list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
FeaturePrefer256Bit,
FeatureCDI,
FeatureDQI,
FeatureBWI,
FeatureVLX,
FeaturePKU,
FeatureVBMI,
FeatureIFMA,
FeatureSHA,
FeatureSGX];
list<SubtargetFeature> CNLSpecificFeatures = [FeatureHasFastGather];
list<SubtargetFeature> CNLInheritableFeatures =
!listconcat(SKLInheritableFeatures, CNLAdditionalFeatures);
list<SubtargetFeature> CNLFeatures =
!listconcat(CNLInheritableFeatures, CNLSpecificFeatures);
// Icelake
list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG,
FeatureVAES,
FeatureVBMI2,
FeatureVNNI,
FeatureVPCLMULQDQ,
FeatureVPOPCNTDQ,
FeatureGFNI,
FeatureCLWB,
FeatureRDPID];
list<SubtargetFeature> ICLSpecificFeatures = [FeatureHasFastGather];
list<SubtargetFeature> ICLInheritableFeatures =
!listconcat(CNLInheritableFeatures, ICLAdditionalFeatures);
list<SubtargetFeature> ICLFeatures =
!listconcat(ICLInheritableFeatures, ICLSpecificFeatures);
// Icelake Server
list<SubtargetFeature> ICXSpecificFeatures = [FeaturePCONFIG,
FeatureWBNOINVD,
FeatureHasFastGather];
list<SubtargetFeature> ICXFeatures =
!listconcat(ICLInheritableFeatures, ICXSpecificFeatures);
// Tigerlake
list<SubtargetFeature> TGLAdditionalFeatures = [FeatureVP2INTERSECT,
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureSHSTK];
list<SubtargetFeature> TGLSpecificFeatures = [FeatureHasFastGather];
list<SubtargetFeature> TGLInheritableFeatures =
!listconcat(TGLAdditionalFeatures, TGLSpecificFeatures);
list<SubtargetFeature> TGLFeatures =
!listconcat(ICLFeatures, TGLInheritableFeatures);
// Atom
list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSSE3,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeatureMOVBE,
FeatureSlowTwoMemOps,
FeatureLAHFSAHF,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom,
FeatureSlowUAMem16,
FeatureLEAForSP,
FeatureSlowDivide32,
FeatureSlowDivide64,
FeatureLEAUsesAG,
FeaturePadShortFunctions];
list<SubtargetFeature> AtomFeatures =
!listconcat(AtomInheritableFeatures, AtomSpecificFeatures);
// Silvermont
list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
FeaturePOPCNT,
FeaturePCLMUL,
FeaturePRFCHW,
FeatureSlowLEA,
FeatureSlowIncDec,
FeatureRDRAND];
list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM,
FeatureSlowDivide64,
FeatureSlowPMULLD,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> SLMInheritableFeatures =
!listconcat(AtomInheritableFeatures, SLMAdditionalFeatures);
list<SubtargetFeature> SLMFeatures =
!listconcat(SLMInheritableFeatures, SLMSpecificFeatures);
// Goldmont
list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES,
FeatureSHA,
FeatureRDSEED,
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureXSAVEC,
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureFSGSBase];
list<SubtargetFeature> GLMSpecificFeatures = [FeatureUseGLMDivSqrtCosts,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> GLMInheritableFeatures =
!listconcat(SLMInheritableFeatures, GLMAdditionalFeatures);
list<SubtargetFeature> GLMFeatures =
!listconcat(GLMInheritableFeatures, GLMSpecificFeatures);
// Goldmont Plus
list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
FeatureRDPID,
FeatureSGX];
list<SubtargetFeature> GLPSpecificFeatures = [FeatureUseGLMDivSqrtCosts];
list<SubtargetFeature> GLPInheritableFeatures =
!listconcat(GLMInheritableFeatures, GLPAdditionalFeatures);
list<SubtargetFeature> GLPFeatures =
!listconcat(GLPInheritableFeatures, GLPSpecificFeatures);
// Tremont
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLDEMOTE,
FeatureGFNI,
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureWAITPKG];
list<SubtargetFeature> TRMSpecificFeatures = [FeatureUseGLMDivSqrtCosts];
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPInheritableFeatures, TRMAdditionalFeatures,
TRMSpecificFeatures);
// Knights Landing
list<SubtargetFeature> KNLFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureSlowDivide64,
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
FeatureSlow3OpsLEA,
FeatureSlowIncDec,
FeatureAES,
FeatureRDRAND,
FeatureF16C,
FeatureFSGSBase,
FeatureAVX512,
FeatureERI,
FeatureCDI,
FeaturePFI,
FeaturePREFETCHWT1,
FeatureADX,
FeatureRDSEED,
FeatureMOVBE,
FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
FeatureFMA,
FeaturePRFCHW,
FeaturePreferMaskRegisters,
FeatureSlowTwoMemOps,
FeatureHasFastGather,
FeatureSlowPMADDWD];
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
list<SubtargetFeature> KNMFeatures =
!listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
// Barcelona
list<SubtargetFeature> BarcelonaInheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureSSE4A,
Feature3DNowA,
FeatureFXSR,
FeatureNOPL,
FeatureCMPXCHG16B,
FeatureLZCNT,
FeaturePOPCNT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
FeatureCMOV,
Feature64Bit,
FeatureFastScalarShiftMasks,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> BarcelonaFeatures = BarcelonaInheritableFeatures;
// Bobcat
list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSSE3,
FeatureSSE4A,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeaturePRFCHW,
FeatureLZCNT,
FeaturePOPCNT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
FeatureFast15ByteNOP,
FeatureFastScalarShiftMasks,
FeatureFastVectorShiftMasks];
list<SubtargetFeature> BtVer1SpecificFeatures = [FeatureInsertVZEROUPPER];
list<SubtargetFeature> BtVer1Features =
!listconcat(BtVer1InheritableFeatures, BtVer1SpecificFeatures);
// Jaguar
list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
FeatureAES,
FeaturePCLMUL,
FeatureBMI,
FeatureF16C,
FeatureMOVBE,
FeatureXSAVE,
FeatureXSAVEOPT];
list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT,
FeatureFastBEXTR,
FeatureFastHorizontalOps];
list<SubtargetFeature> BtVer2InheritableFeatures =
!listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures);
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer2InheritableFeatures, BtVer2SpecificFeatures);
// Bulldozer
list<SubtargetFeature> BdVer1InheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureXOP,
Feature64Bit,
FeatureCMPXCHG16B,
FeatureAES,
FeaturePRFCHW,
FeaturePCLMUL,
FeatureMMX,
FeatureFXSR,
FeatureNOPL,
FeatureLZCNT,
FeaturePOPCNT,
FeatureXSAVE,
FeatureLWP,
FeatureSlowSHLD,
FeatureLAHFSAHF,
FeatureFast11ByteNOP,
FeatureFastScalarShiftMasks,
FeatureBranchFusion,
FeatureInsertVZEROUPPER];
list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
FeatureBMI,
FeatureTBM,
FeatureFMA,
FeatureFastBEXTR];
list<SubtargetFeature> BdVer2InheritableFeatures =
!listconcat(BdVer1InheritableFeatures, BdVer2AdditionalFeatures);
list<SubtargetFeature> BdVer2Features = BdVer2InheritableFeatures;
// Steamroller
list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT,
FeatureFSGSBase];
list<SubtargetFeature> BdVer3InheritableFeatures =
!listconcat(BdVer2InheritableFeatures, BdVer3AdditionalFeatures);
list<SubtargetFeature> BdVer3Features = BdVer3InheritableFeatures;
// Excavator
list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
FeatureBMI2,
FeatureMWAITX];
list<SubtargetFeature> BdVer4InheritableFeatures =
!listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures);
list<SubtargetFeature> BdVer4Features = BdVer4InheritableFeatures;
// AMD Zen Processors common ISAs
list<SubtargetFeature> ZNFeatures = [FeatureADX,
FeatureAES,
FeatureAVX2,
FeatureBMI,
FeatureBMI2,
FeatureCLFLUSHOPT,
FeatureCLZERO,
FeatureCMOV,
Feature64Bit,
FeatureCMPXCHG16B,
FeatureF16C,
FeatureFMA,
FeatureFSGSBase,
FeatureFXSR,
FeatureNOPL,
FeatureFastLZCNT,
FeatureLAHFSAHF,
FeatureLZCNT,
FeatureFastBEXTR,
FeatureFast15ByteNOP,
FeatureBranchFusion,
FeatureFastScalarShiftMasks,
FeatureMMX,
FeatureMOVBE,
FeatureMWAITX,
FeaturePCLMUL,
FeaturePOPCNT,
FeaturePRFCHW,
FeatureRDRAND,
FeatureRDSEED,
FeatureSHA,
FeatureSSE4A,
FeatureSlowSHLD,
FeatureInsertVZEROUPPER,
FeatureX87,
FeatureXSAVE,
FeatureXSAVEC,
FeatureXSAVEOPT,
FeatureXSAVES];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureWBNOINVD];
list<SubtargetFeature> ZN2Features =
!listconcat(ZNFeatures, ZN2AdditionalFeatures);
}
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
// if i386/i486 is specifically requested.
def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16,
FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16,
FeatureInsertVZEROUPPER]>;
def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16,
FeatureInsertVZEROUPPER]>;
def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16,
FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16,
FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16,
FeatureCMPXCHG8B, FeatureMMX,
FeatureInsertVZEROUPPER]>;
def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureCMOV, FeatureInsertVZEROUPPER]>;
def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureCMOV, FeatureNOPL, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureCMOV, FeatureFXSR,
FeatureNOPL, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium3", "pentium3m"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX,
FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV,
FeatureInsertVZEROUPPER]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
// The intent is to enable it for pentium4, which is the current default
// processor in a vanilla 32-bit clang compilation when no specific
// architecture is specified. This generally gives a nice performance
// increase on silvermont, with largely neutral behavior on other
// contemporary large core processors.
// pentium-m, pentium4m, prescott and nocona are included as a preventative
// measure to avoid performance surprises, in case clang's default cpu
// changes slightly.
def : ProcessorModel<"pentium-m", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
FeatureCMOV, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcessorModel<P, GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
FeatureCMOV, FeatureInsertVZEROUPPER]>;
}
// Intel Quark.
def : Proc<"lakemont", [FeatureInsertVZEROUPPER]>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
FeatureCMOV, FeatureInsertVZEROUPPER]>;
// NetBurst.
def : ProcessorModel<"prescott", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
FeatureCMOV, FeatureInsertVZEROUPPER]>;
def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureSlowUAMem16,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE3,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeatureInsertVZEROUPPER
]>;
// Intel Core 2 Solo/Duo.
def : ProcessorModel<"core2", SandyBridgeModel, [
FeatureX87,
FeatureSlowUAMem16,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSSE3,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
FeatureMacroFusion,
FeatureInsertVZEROUPPER
]>;
def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureX87,
FeatureSlowUAMem16,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE41,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
FeatureMacroFusion,
FeatureInsertVZEROUPPER
]>;
// Atom CPUs.
foreach P = ["bonnell", "atom"] in {
def : ProcessorModel<P, AtomModel, ProcessorFeatures.AtomFeatures>;
}
foreach P = ["silvermont", "slm"] in {
def : ProcessorModel<P, SLMModel, ProcessorFeatures.SLMFeatures>;
}
def : ProcessorModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures>;
def : ProcessorModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures>;
def : ProcessorModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures>;
// "Arrandale" along with corei3 and corei5
foreach P = ["nehalem", "corei7"] in {
def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures>;
}
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
def : ProcessorModel<"westmere", SandyBridgeModel,
ProcessorFeatures.WSMFeatures>;
foreach P = ["sandybridge", "corei7-avx"] in {
def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures>;
}
foreach P = ["ivybridge", "core-avx-i"] in {
def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures>;
}
foreach P = ["haswell", "core-avx2"] in {
def : ProcessorModel<P, HaswellModel, ProcessorFeatures.HSWFeatures>;
}
def : ProcessorModel<"broadwell", BroadwellModel,
ProcessorFeatures.BDWFeatures>;
def : ProcessorModel<"skylake", SkylakeClientModel,
ProcessorFeatures.SKLFeatures>;
// FIXME: define KNL scheduler model
def : ProcessorModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures>;
def : ProcessorModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures>;
foreach P = ["skylake-avx512", "skx"] in {
def : ProcessorModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures>;
}
def : ProcessorModel<"cascadelake", SkylakeServerModel,
ProcessorFeatures.CLXFeatures>;
def : ProcessorModel<"cooperlake", SkylakeServerModel,
ProcessorFeatures.CPXFeatures>;
def : ProcessorModel<"cannonlake", SkylakeServerModel,
ProcessorFeatures.CNLFeatures>;
def : ProcessorModel<"icelake-client", SkylakeServerModel,
ProcessorFeatures.ICLFeatures>;
def : ProcessorModel<"icelake-server", SkylakeServerModel,
ProcessorFeatures.ICXFeatures>;
def : ProcessorModel<"tigerlake", SkylakeServerModel,
ProcessorFeatures.TGLFeatures>;
// AMD CPUs.
def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureInsertVZEROUPPER]>;
def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
Feature3DNow, FeatureInsertVZEROUPPER]>;
def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
Feature3DNow, FeatureInsertVZEROUPPER]>;
foreach P = ["athlon", "athlon-tbird"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
Feature3DNowA, FeatureNOPL, FeatureSlowSHLD,
FeatureInsertVZEROUPPER]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL,
FeatureSlowSHLD, FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL,
Feature64Bit, FeatureSlowSHLD, FeatureCMOV,
FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3,
Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B,
FeatureSlowSHLD, FeatureCMOV, Feature64Bit,
FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>;
}
foreach P = ["amdfam10", "barcelona"] in {
def : Proc<P, ProcessorFeatures.BarcelonaFeatures>;
}
// Bobcat
def : Proc<"btver1", ProcessorFeatures.BtVer1Features>;
// Jaguar
def : ProcessorModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features>;
// Bulldozer
def : ProcessorModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features>;
// Piledriver
def : ProcessorModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features>;
// Steamroller
def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>;
// Excavator
def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>;
def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>;
def : ProcessorModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features>;
def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
Feature3DNowA, FeatureInsertVZEROUPPER]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureInsertVZEROUPPER]>;
def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow,
FeatureInsertVZEROUPPER]>;
def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow,
FeatureInsertVZEROUPPER]>;
def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE1, FeatureFXSR,
FeatureCMOV, FeatureInsertVZEROUPPER]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and
// modern 64-bit x86 chip, and enables features that are generally beneficial.
//
// We currently use the Sandy Bridge model as the default scheduling model as
// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge, which
// covers a huge swath of x86 processors. If there are specific scheduling
// knobs which need to be tuned differently for AMD chips, we might consider
// forming a common base for them.
def : ProcessorModel<"x86-64", SandyBridgeModel, [
FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE2,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureSlow3OpsLEA,
FeatureSlowIncDec,
FeatureMacroFusion,
FeatureInsertVZEROUPPER
]>;
//===----------------------------------------------------------------------===//
// Calling Conventions
//===----------------------------------------------------------------------===//
include "X86CallingConv.td"
//===----------------------------------------------------------------------===//
// Assembly Parser
//===----------------------------------------------------------------------===//
def ATTAsmParserVariant : AsmParserVariant {
int Variant = 0;
// Variant name.
string Name = "att";
// Discard comments in assembly strings.
string CommentDelimiter = "#";
// Recognize hard coded registers.
string RegisterPrefix = "%";
}
def IntelAsmParserVariant : AsmParserVariant {
int Variant = 1;
// Variant name.
string Name = "intel";
// Discard comments in assembly strings.
string CommentDelimiter = ";";
// Recognize hard coded registers.
string RegisterPrefix = "";
}
//===----------------------------------------------------------------------===//
// Assembly Printers
//===----------------------------------------------------------------------===//
// The X86 target supports two different syntaxes for emitting machine code.
// This is controlled by the -x86-asm-syntax={att|intel} flag.
def ATTAsmWriter : AsmWriter {
string AsmWriterClassName = "ATTInstPrinter";
int Variant = 0;
}
def IntelAsmWriter : AsmWriter {
string AsmWriterClassName = "IntelInstPrinter";
int Variant = 1;
}
def X86 : Target {
// Information about the instructions...
let InstructionSet = X86InstrInfo;
let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
let AllowRegisterRenaming = 1;
}
//===----------------------------------------------------------------------===//
// Pfm Counters
//===----------------------------------------------------------------------===//
include "X86PfmCounters.td"
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp (revision 362609)
@@ -1,4013 +1,4013 @@
//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the X86-specific support for the FastISel class. Much
// of the target-specific code is generated by tablegen in the file
// X86GenFastISel.inc, which is #included here.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
namespace {
class X86FastISel final : public FastISel {
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
/// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
/// floating point ops.
/// When SSE is available, use it for f32 operations.
/// When SSE2 is available, use it for f64 operations.
bool X86ScalarSSEf64;
bool X86ScalarSSEf32;
public:
explicit X86FastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo)
: FastISel(funcInfo, libInfo) {
Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
}
bool fastSelectInstruction(const Instruction *I) override;
/// The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
/// try to fold the load as an operand to the instruction, returning true on
/// success.
bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) override;
bool fastLowerArguments() override;
bool fastLowerCall(CallLoweringInfo &CLI) override;
bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
#include "X86GenFastISel.inc"
private:
bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
const DebugLoc &DL);
bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
unsigned &ResultReg, unsigned Alignment = 1);
bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
MachineMemOperand *MMO = nullptr, bool Aligned = false);
bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
X86AddressMode &AM,
MachineMemOperand *MMO = nullptr, bool Aligned = false);
bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
unsigned &ResultReg);
bool X86SelectAddress(const Value *V, X86AddressMode &AM);
bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
bool X86SelectLoad(const Instruction *I);
bool X86SelectStore(const Instruction *I);
bool X86SelectRet(const Instruction *I);
bool X86SelectCmp(const Instruction *I);
bool X86SelectZExt(const Instruction *I);
bool X86SelectSExt(const Instruction *I);
bool X86SelectBranch(const Instruction *I);
bool X86SelectShift(const Instruction *I);
bool X86SelectDivRem(const Instruction *I);
bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
bool X86SelectSelect(const Instruction *I);
bool X86SelectTrunc(const Instruction *I);
bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
const TargetRegisterClass *RC);
bool X86SelectFPExt(const Instruction *I);
bool X86SelectFPTrunc(const Instruction *I);
bool X86SelectSIToFP(const Instruction *I);
bool X86SelectUIToFP(const Instruction *I);
bool X86SelectIntToFP(const Instruction *I, bool IsSigned);
const X86InstrInfo *getInstrInfo() const {
return Subtarget->getInstrInfo();
}
const X86TargetMachine *getTargetMachine() const {
return static_cast<const X86TargetMachine *>(&TM);
}
bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
unsigned fastMaterializeConstant(const Constant *C) override;
unsigned fastMaterializeAlloca(const AllocaInst *C) override;
unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
/// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
/// computed in an SSE register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is legal when SSE2 is on
(VT == MVT::f32 && X86ScalarSSEf32); // f32 is legal when SSE1 is on
}
bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
bool IsMemcpySmall(uint64_t Len);
bool TryEmitSmallMemcpy(X86AddressMode DestAM,
X86AddressMode SrcAM, uint64_t Len);
bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
const Value *Cond);
const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
X86AddressMode &AM);
unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, unsigned Op1, bool Op1IsKill,
unsigned Op2, bool Op2IsKill, unsigned Op3,
bool Op3IsKill);
};
} // end anonymous namespace.
static std::pair<unsigned, bool>
getX86SSEConditionCode(CmpInst::Predicate Predicate) {
unsigned CC;
bool NeedSwap = false;
// SSE Condition code mapping:
// 0 - EQ
// 1 - LT
// 2 - LE
// 3 - UNORD
// 4 - NEQ
// 5 - NLT
// 6 - NLE
// 7 - ORD
switch (Predicate) {
default: llvm_unreachable("Unexpected predicate");
case CmpInst::FCMP_OEQ: CC = 0; break;
case CmpInst::FCMP_OGT: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_OLT: CC = 1; break;
case CmpInst::FCMP_OGE: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_OLE: CC = 2; break;
case CmpInst::FCMP_UNO: CC = 3; break;
case CmpInst::FCMP_UNE: CC = 4; break;
case CmpInst::FCMP_ULE: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_UGE: CC = 5; break;
case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_UGT: CC = 6; break;
case CmpInst::FCMP_ORD: CC = 7; break;
case CmpInst::FCMP_UEQ: CC = 8; break;
case CmpInst::FCMP_ONE: CC = 12; break;
}
return std::make_pair(CC, NeedSwap);
}
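// Usage example (sketch): for CmpInst::FCMP_OGT this returns {1 /*LT*/, true},
// i.e. the caller emits the "less-than" compare with its operands swapped,
// since a > b is equivalent to b < a.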
/// Adds a complex addressing mode to the given machine instr builder.
/// Note that this will constrain the index register. If it's not possible to
/// constrain the given index register, then a new one will be created. The
/// IndexReg field of the addressing mode will be updated to match in this case.
const MachineInstrBuilder &
X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
X86AddressMode &AM) {
// First constrain the index register. It needs to be a GR64_NOSP.
AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg,
MIB->getNumOperands() +
X86::AddrIndexReg);
return ::addFullAddress(MIB, AM);
}
/// Check if it is possible to fold the condition from the XALU intrinsic
/// into the user. The condition code will only be updated on success.
bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
const Value *Cond) {
if (!isa<ExtractValueInst>(Cond))
return false;
const auto *EV = cast<ExtractValueInst>(Cond);
if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
return false;
const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
MVT RetVT;
const Function *Callee = II->getCalledFunction();
Type *RetTy =
cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
if (!isTypeLegal(RetTy, RetVT))
return false;
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return false;
X86::CondCode TmpCC;
switch (II->getIntrinsicID()) {
default: return false;
case Intrinsic::sadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
case Intrinsic::uadd_with_overflow:
case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
}
// Check if both instructions are in the same basic block.
if (II->getParent() != I->getParent())
return false;
// Make sure nothing is in the way
BasicBlock::const_iterator Start(I);
BasicBlock::const_iterator End(II);
for (auto Itr = std::prev(Start); Itr != End; --Itr) {
// We only expect extractvalue instructions between the intrinsic and the
// instruction to be selected.
if (!isa<ExtractValueInst>(Itr))
return false;
// Check that the extractvalue operand comes from the intrinsic.
const auto *EVI = cast<ExtractValueInst>(Itr);
if (EVI->getAggregateOperand() != II)
return false;
}
CC = TmpCC;
return true;
}
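// Example of the IR shape this folds (sketch):
//   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//   %v = extractvalue { i32, i1 } %s, 0
//   %o = extractvalue { i32, i1 } %s, 1   ; the "Cond" passed in
//   br i1 %o, ...
// On success CC becomes X86::COND_O (or X86::COND_B for the unsigned add/sub
// cases), so the branch can test the flags set by the arithmetic directly.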
bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
if (evt == MVT::Other || !evt.isSimple())
// Unhandled type. Halt "fast" selection and bail.
return false;
VT = evt.getSimpleVT();
// For now, require SSE/SSE2 for performing floating-point operations,
// since x87 requires additional work.
if (VT == MVT::f64 && !X86ScalarSSEf64)
return false;
if (VT == MVT::f32 && !X86ScalarSSEf32)
return false;
// Similarly, no f80 support yet.
if (VT == MVT::f80)
return false;
// We only handle legal types. For example, on x86-32 the instruction
// selector contains all of the 64-bit instructions from x86-64,
// under the assumption that i64 won't be used if the target doesn't
// support it.
return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
}
/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
/// Return true and the result register by reference if it is possible.
bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
MachineMemOperand *MMO, unsigned &ResultReg,
unsigned Alignment) {
bool HasSSE41 = Subtarget->hasSSE41();
bool HasAVX = Subtarget->hasAVX();
bool HasAVX2 = Subtarget->hasAVX2();
bool HasAVX512 = Subtarget->hasAVX512();
bool HasVLX = Subtarget->hasVLX();
bool IsNonTemporal = MMO && MMO->isNonTemporal();
// Treat i1 loads the same as i8 loads. Masking will be done when storing.
if (VT == MVT::i1)
VT = MVT::i8;
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
switch (VT.SimpleTy) {
default: return false;
case MVT::i8:
Opc = X86::MOV8rm;
break;
case MVT::i16:
Opc = X86::MOV16rm;
break;
case MVT::i32:
Opc = X86::MOV32rm;
break;
case MVT::i64:
// Must be in x86-64 mode.
Opc = X86::MOV64rm;
break;
case MVT::f32:
if (X86ScalarSSEf32)
Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
HasAVX ? X86::VMOVSSrm_alt :
X86::MOVSSrm_alt;
else
Opc = X86::LD_Fp32m;
break;
case MVT::f64:
if (X86ScalarSSEf64)
Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
HasAVX ? X86::VMOVSDrm_alt :
X86::MOVSDrm_alt;
else
Opc = X86::LD_Fp64m;
break;
case MVT::f80:
// No f80 support yet.
return false;
case MVT::v4f32:
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
Opc = HasVLX ? X86::VMOVAPSZ128rm :
HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
else
Opc = HasVLX ? X86::VMOVUPSZ128rm :
HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
break;
case MVT::v2f64:
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
Opc = HasVLX ? X86::VMOVAPDZ128rm :
HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
else
Opc = HasVLX ? X86::VMOVUPDZ128rm :
HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
Opc = HasVLX ? X86::VMOVDQA64Z128rm :
HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
else
Opc = HasVLX ? X86::VMOVDQU64Z128rm :
HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
break;
case MVT::v8f32:
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
else if (IsNonTemporal && Alignment >= 16)
return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
else
Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
break;
case MVT::v4f64:
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
else if (IsNonTemporal && Alignment >= 16)
return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
else
Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
break;
case MVT::v8i32:
case MVT::v4i64:
case MVT::v16i16:
case MVT::v32i8:
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
else if (IsNonTemporal && Alignment >= 16)
return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
else
Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
break;
case MVT::v16f32:
assert(HasAVX512);
if (IsNonTemporal && Alignment >= 64)
Opc = X86::VMOVNTDQAZrm;
else
Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
break;
case MVT::v8f64:
assert(HasAVX512);
if (IsNonTemporal && Alignment >= 64)
Opc = X86::VMOVNTDQAZrm;
else
Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
break;
case MVT::v8i64:
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8:
assert(HasAVX512);
// Note: There are a lot more choices based on type with AVX-512, but
// there's really no advantage when the load isn't masked.
if (IsNonTemporal && Alignment >= 64)
Opc = X86::VMOVNTDQAZrm;
else
Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
break;
}
const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
ResultReg = createResultReg(RC);
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
addFullAddress(MIB, AM);
if (MMO)
MIB->addMemOperand(*FuncInfo.MF, MMO);
return true;
}
/// X86FastEmitStore - Emit a machine instruction to store a value Val of
/// type VT. The address is either pre-computed, consisting of a base pointer
/// Ptr and a displacement offset, or a GlobalAddress, i.e. V. Return true if
/// it is possible.
bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
X86AddressMode &AM,
MachineMemOperand *MMO, bool Aligned) {
bool HasSSE1 = Subtarget->hasSSE1();
bool HasSSE2 = Subtarget->hasSSE2();
bool HasSSE4A = Subtarget->hasSSE4A();
bool HasAVX = Subtarget->hasAVX();
bool HasAVX512 = Subtarget->hasAVX512();
bool HasVLX = Subtarget->hasVLX();
bool IsNonTemporal = MMO && MMO->isNonTemporal();
// Get opcode and regclass of the output for the given store instruction.
unsigned Opc = 0;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f80: // No f80 support yet.
default: return false;
case MVT::i1: {
// Mask out all but lowest bit.
unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::AND8ri), AndResult)
.addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
ValReg = AndResult;
LLVM_FALLTHROUGH; // handle i1 as i8.
}
case MVT::i8: Opc = X86::MOV8mr; break;
case MVT::i16: Opc = X86::MOV16mr; break;
case MVT::i32:
Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
break;
case MVT::i64:
// Must be in x86-64 mode.
Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
break;
case MVT::f32:
if (X86ScalarSSEf32) {
if (IsNonTemporal && HasSSE4A)
Opc = X86::MOVNTSS;
else
Opc = HasAVX512 ? X86::VMOVSSZmr :
HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
} else
Opc = X86::ST_Fp32m;
break;
case MVT::f64:
if (X86ScalarSSEf64) { // f64 stores need the SSE2 flag, not the SSE1 one.
if (IsNonTemporal && HasSSE4A)
Opc = X86::MOVNTSD;
else
Opc = HasAVX512 ? X86::VMOVSDZmr :
HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
} else
Opc = X86::ST_Fp64m;
break;
case MVT::x86mmx:
Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
break;
case MVT::v4f32:
if (Aligned) {
if (IsNonTemporal)
Opc = HasVLX ? X86::VMOVNTPSZ128mr :
HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
else
Opc = HasVLX ? X86::VMOVAPSZ128mr :
HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
} else
Opc = HasVLX ? X86::VMOVUPSZ128mr :
HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
break;
case MVT::v2f64:
if (Aligned) {
if (IsNonTemporal)
Opc = HasVLX ? X86::VMOVNTPDZ128mr :
HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
else
Opc = HasVLX ? X86::VMOVAPDZ128mr :
HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
} else
Opc = HasVLX ? X86::VMOVUPDZ128mr :
HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
if (Aligned) {
if (IsNonTemporal)
Opc = HasVLX ? X86::VMOVNTDQZ128mr :
HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
else
Opc = HasVLX ? X86::VMOVDQA64Z128mr :
HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
} else
Opc = HasVLX ? X86::VMOVDQU64Z128mr :
HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
break;
case MVT::v8f32:
assert(HasAVX);
if (Aligned) {
if (IsNonTemporal)
Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
else
Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
} else
Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
break;
case MVT::v4f64:
assert(HasAVX);
if (Aligned) {
if (IsNonTemporal)
Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
else
Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
} else
Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
break;
case MVT::v8i32:
case MVT::v4i64:
case MVT::v16i16:
case MVT::v32i8:
assert(HasAVX);
if (Aligned) {
if (IsNonTemporal)
Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
else
Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
} else
Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
break;
case MVT::v16f32:
assert(HasAVX512);
if (Aligned)
Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
else
Opc = X86::VMOVUPSZmr;
break;
case MVT::v8f64:
assert(HasAVX512);
if (Aligned) {
Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
} else
Opc = X86::VMOVUPDZmr;
break;
case MVT::v8i64:
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8:
assert(HasAVX512);
// Note: There are a lot more choices based on type with AVX-512, but
// there's really no advantage when the store isn't masked.
if (Aligned)
Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
else
Opc = X86::VMOVDQU64Zmr;
break;
}
const MCInstrDesc &Desc = TII.get(Opc);
// Some of the instructions in the previous switch use FR128 instead
// of FR32 for ValReg. Make sure the register we feed the instruction
// matches its register class constraints.
// Note: It is fine to copy from FR32 to FR128; these are the same registers
// behind the scenes, which is why this never triggered any bugs before.
ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc);
addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
if (MMO)
MIB->addMemOperand(*FuncInfo.MF, MMO);
return true;
}
bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
X86AddressMode &AM,
MachineMemOperand *MMO, bool Aligned) {
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Val))
Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
// If this is a store of a simple constant, fold the constant into the store.
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
unsigned Opc = 0;
bool Signed = true;
switch (VT.getSimpleVT().SimpleTy) {
default: break;
case MVT::i1:
Signed = false;
LLVM_FALLTHROUGH; // Handle as i8.
case MVT::i8: Opc = X86::MOV8mi; break;
case MVT::i16: Opc = X86::MOV16mi; break;
case MVT::i32: Opc = X86::MOV32mi; break;
case MVT::i64:
// Must be a 32-bit sign extended value.
if (isInt<32>(CI->getSExtValue()))
Opc = X86::MOV64mi32;
break;
}
if (Opc) {
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
: CI->getZExtValue());
if (MMO)
MIB->addMemOperand(*FuncInfo.MF, MMO);
return true;
}
}
unsigned ValReg = getRegForValue(Val);
if (ValReg == 0)
return false;
bool ValKill = hasTrivialKill(Val);
return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
}
/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
/// ISD::SIGN_EXTEND).
bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
unsigned Src, EVT SrcVT,
unsigned &ResultReg) {
unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
Src, /*TODO: Kill=*/false);
if (RR == 0)
return false;
ResultReg = RR;
return true;
}
bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
// Handle constant address.
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
// Can't handle alternate code models yet.
if (TM.getCodeModel() != CodeModel::Small)
return false;
// Can't handle TLS yet.
if (GV->isThreadLocal())
return false;
// Can't handle !absolute_symbol references yet.
if (GV->isAbsoluteSymbolRef())
return false;
// RIP-relative addresses can't have additional register operands, so if
// we've already folded stuff into the addressing mode, just force the
// global value into its own register, which we can use as the basereg.
if (!Subtarget->isPICStyleRIPRel() ||
(AM.Base.Reg == 0 && AM.IndexReg == 0)) {
// Okay, we've committed to selecting this global. Set up the address.
AM.GV = GV;
// Allow the subtarget to classify the global.
unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
// If this reference is relative to the pic base, set it now.
if (isGlobalRelativeToPICBase(GVFlags)) {
// FIXME: How do we know Base.Reg is free??
AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
}
// Unless the ABI requires an extra load, return a direct reference to
// the global.
if (!isGlobalStubReference(GVFlags)) {
if (Subtarget->isPICStyleRIPRel()) {
// Use rip-relative addressing if we can. Above we verified that the
// base and index registers are unused.
assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
AM.Base.Reg = X86::RIP;
}
AM.GVOpFlags = GVFlags;
return true;
}
// Ok, we need to do a load from a stub. If we've already loaded from
// this stub, reuse the loaded pointer, otherwise emit the load now.
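// (Illustrative) On x86-64 PIC, for an external global this typically
// materializes a GOT load such as:
//   movq foo@GOTPCREL(%rip), %reg
// and %reg then becomes the base register of the final address.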
DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
unsigned LoadReg;
if (I != LocalValueMap.end() && I->second != 0) {
LoadReg = I->second;
} else {
// Issue load from stub.
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
X86AddressMode StubAM;
StubAM.Base.Reg = AM.Base.Reg;
StubAM.GV = GV;
StubAM.GVOpFlags = GVFlags;
// Prepare for inserting code in the local-value area.
SavePoint SaveInsertPt = enterLocalValueArea();
if (TLI.getPointerTy(DL) == MVT::i64) {
Opc = X86::MOV64rm;
RC = &X86::GR64RegClass;
if (Subtarget->isPICStyleRIPRel())
StubAM.Base.Reg = X86::RIP;
} else {
Opc = X86::MOV32rm;
RC = &X86::GR32RegClass;
}
LoadReg = createResultReg(RC);
MachineInstrBuilder LoadMI =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
addFullAddress(LoadMI, StubAM);
// Ok, back to normal mode.
leaveLocalValueArea(SaveInsertPt);
// Prevent loading GV stub multiple times in same MBB.
LocalValueMap[V] = LoadReg;
}
// Now construct the final address. Note that the Disp, Scale,
// and Index values may already be set here.
AM.Base.Reg = LoadReg;
AM.GV = nullptr;
return true;
}
}
// If all else fails, try to materialize the value in a register.
if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
if (AM.Base.Reg == 0) {
AM.Base.Reg = getRegForValue(V);
return AM.Base.Reg != 0;
}
if (AM.IndexReg == 0) {
assert(AM.Scale == 1 && "Scale with no index!");
AM.IndexReg = getRegForValue(V);
return AM.IndexReg != 0;
}
}
return false;
}
/// X86SelectAddress - Attempt to fill in an address from the given value.
///
bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
SmallVector<const Value *, 32> GEPs;
redo_gep:
const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
if (const Instruction *I = dyn_cast<Instruction>(V)) {
// Don't walk into other basic blocks; it's possible we haven't
// visited them yet, so the instructions may not yet be assigned
// virtual registers.
if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
Opcode = I->getOpcode();
U = I;
}
} else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
Opcode = C->getOpcode();
U = C;
}
if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
if (Ty->getAddressSpace() > 255)
// Fast instruction selection doesn't support the special
// address spaces.
return false;
switch (Opcode) {
default: break;
case Instruction::BitCast:
// Look past bitcasts.
return X86SelectAddress(U->getOperand(0), AM);
case Instruction::IntToPtr:
// Look past no-op inttoptrs.
if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
TLI.getPointerTy(DL))
return X86SelectAddress(U->getOperand(0), AM);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints.
if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return X86SelectAddress(U->getOperand(0), AM);
break;
case Instruction::Alloca: {
// Do static allocas.
const AllocaInst *A = cast<AllocaInst>(V);
DenseMap<const AllocaInst *, int>::iterator SI =
FuncInfo.StaticAllocaMap.find(A);
if (SI != FuncInfo.StaticAllocaMap.end()) {
AM.BaseType = X86AddressMode::FrameIndexBase;
AM.Base.FrameIndex = SI->second;
return true;
}
break;
}
case Instruction::Add: {
// Adds of constants are common and easy enough.
if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
// They have to fit in the 32-bit signed displacement field though.
if (isInt<32>(Disp)) {
AM.Disp = (uint32_t)Disp;
return X86SelectAddress(U->getOperand(0), AM);
}
}
break;
}
case Instruction::GetElementPtr: {
X86AddressMode SavedAM = AM;
// Pattern-match simple GEPs.
uint64_t Disp = (int32_t)AM.Disp;
unsigned IndexReg = AM.IndexReg;
unsigned Scale = AM.Scale;
gep_type_iterator GTI = gep_type_begin(U);
// Iterate through the indices, folding what we can. Constants can be
// folded, and one dynamic index can be handled, if the scale is supported.
for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
i != e; ++i, ++GTI) {
const Value *Op = *i;
if (StructType *STy = GTI.getStructTypeOrNull()) {
const StructLayout *SL = DL.getStructLayout(STy);
Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
continue;
}
// An array/variable index is always of the form i*S where S is the
// constant scale size. See if we can push the scale into immediates.
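// For example (illustrative): in "getelementptr i32, i32* %p, i64 %i" a
// constant index of 3 folds into the displacement as [%p + 12], while a
// dynamic %i becomes a scaled index as [%p + %i*4].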
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
for (;;) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
Disp += CI->getSExtValue() * S;
break;
}
if (canFoldAddIntoGEP(U, Op)) {
// A compatible add with a constant operand. Fold the constant.
ConstantInt *CI =
cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
Disp += CI->getSExtValue() * S;
// Iterate on the other operand.
Op = cast<AddOperator>(Op)->getOperand(0);
continue;
}
if (IndexReg == 0 &&
(!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
(S == 1 || S == 2 || S == 4 || S == 8)) {
// Scaled-index addressing.
Scale = S;
IndexReg = getRegForGEPIndex(Op).first;
if (IndexReg == 0)
return false;
break;
}
// Unsupported.
goto unsupported_gep;
}
}
// Check for displacement overflow.
if (!isInt<32>(Disp))
break;
AM.IndexReg = IndexReg;
AM.Scale = Scale;
AM.Disp = (uint32_t)Disp;
GEPs.push_back(V);
if (const GetElementPtrInst *GEP =
dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
// Ok, the GEP indices were covered by constant-offset and scaled-index
// addressing. Update the address state and move on to examining the base.
V = GEP;
goto redo_gep;
} else if (X86SelectAddress(U->getOperand(0), AM)) {
return true;
}
// If we couldn't merge the GEP value into this addressing mode, revert to
// our saved address and just match the value instead of failing completely.
AM = SavedAM;
for (const Value *I : reverse(GEPs))
if (handleConstantAddresses(I, AM))
return true;
return false;
unsupported_gep:
// Ok, the GEP indices weren't all covered.
break;
}
}
return handleConstantAddresses(V, AM);
}
/// X86SelectCallAddress - Attempt to fill in an address from the given value.
///
bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
const Instruction *I = dyn_cast<Instruction>(V);
// Record if the value is defined in the same basic block.
//
// This information is crucial for knowing whether it is valid to fold an
// operand.
// Indeed, FastISel generates or reuses a virtual register for all
// operands of all instructions it selects. Obviously, the definition and
// its uses must use the same virtual register, otherwise the produced
// code is incorrect.
// Before instruction selection, FunctionLoweringInfo::set sets the virtual
// registers for values that are alive across basic blocks. This ensures
// that the values are consistently set across basic blocks, even if
// different instruction selection mechanisms are used (e.g., a mix of
// SDISel and FastISel).
// For values local to a basic block, the instruction selection process
// generates these virtual registers with whatever method is appropriate
// for its needs. In particular, FastISel and SDISel do not share the way
// local virtual registers are set.
// Therefore, it is impossible (or at least unsafe) to share values
// between basic blocks unless they use the same instruction selection
// method, which is not guaranteed for X86.
// Moreover, things like hasOneUse cannot be used accurately if we allow
// references to values across basic blocks when they are not alive
// across basic blocks to begin with.
bool InMBB = true;
if (I) {
Opcode = I->getOpcode();
U = I;
InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
} else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
Opcode = C->getOpcode();
U = C;
}
switch (Opcode) {
default: break;
case Instruction::BitCast:
// Look past the bitcast if its operand is in the same BB.
if (InMBB)
return X86SelectCallAddress(U->getOperand(0), AM);
break;
case Instruction::IntToPtr:
// Look past a no-op inttoptr if its operand is in the same BB.
if (InMBB &&
TLI.getValueType(DL, U->getOperand(0)->getType()) ==
TLI.getPointerTy(DL))
return X86SelectCallAddress(U->getOperand(0), AM);
break;
case Instruction::PtrToInt:
// Look past a no-op ptrtoint if its operand is in the same BB.
if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return X86SelectCallAddress(U->getOperand(0), AM);
break;
}
// Handle constant address.
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
// Can't handle alternate code models yet.
if (TM.getCodeModel() != CodeModel::Small)
return false;
// RIP-relative addresses can't have additional register operands.
if (Subtarget->isPICStyleRIPRel() &&
(AM.Base.Reg != 0 || AM.IndexReg != 0))
return false;
// Can't handle TLS.
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
if (GVar->isThreadLocal())
return false;
// Okay, we've committed to selecting this global. Set up the basic address.
AM.GV = GV;
// Return a direct reference to the global. FastISel can handle calls to
// functions that require loads, such as dllimport and nonlazybind
// functions.
if (Subtarget->isPICStyleRIPRel()) {
// Use rip-relative addressing if we can. Above we verified that the
// base and index registers are unused.
assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
AM.Base.Reg = X86::RIP;
} else {
AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
}
return true;
}
// If all else fails, try to materialize the value in a register.
if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
if (AM.Base.Reg == 0) {
AM.Base.Reg = getRegForValue(V);
return AM.Base.Reg != 0;
}
if (AM.IndexReg == 0) {
assert(AM.Scale == 1 && "Scale with no index!");
AM.IndexReg = getRegForValue(V);
return AM.IndexReg != 0;
}
}
return false;
}
/// X86SelectStore - Select and emit code to implement store instructions.
bool X86FastISel::X86SelectStore(const Instruction *I) {
// Atomic stores need special handling.
const StoreInst *S = cast<StoreInst>(I);
if (S->isAtomic())
return false;
const Value *PtrV = I->getOperand(1);
if (TLI.supportSwiftError()) {
// Swifterror values can come from either a function parameter with
// swifterror attribute or an alloca with swifterror attribute.
if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
if (Arg->hasSwiftErrorAttr())
return false;
}
if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
if (Alloca->isSwiftError())
return false;
}
}
const Value *Val = S->getValueOperand();
const Value *Ptr = S->getPointerOperand();
MVT VT;
if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
return false;
unsigned Alignment = S->getAlignment();
unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = ABIAlignment;
bool Aligned = Alignment >= ABIAlignment;
X86AddressMode AM;
if (!X86SelectAddress(Ptr, AM))
return false;
return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
}
/// X86SelectRet - Select and emit code to implement ret instructions.
bool X86FastISel::X86SelectRet(const Instruction *I) {
const ReturnInst *Ret = cast<ReturnInst>(I);
const Function &F = *I->getParent()->getParent();
const X86MachineFunctionInfo *X86MFInfo =
FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
if (!FuncInfo.CanLowerReturn)
return false;
if (TLI.supportSwiftError() &&
F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
return false;
if (TLI.supportSplitCSR(FuncInfo.MF))
return false;
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
CC != CallingConv::Tail &&
CC != CallingConv::X86_FastCall &&
CC != CallingConv::X86_StdCall &&
CC != CallingConv::X86_ThisCall &&
CC != CallingConv::X86_64_SysV &&
CC != CallingConv::Win64)
return false;
// Don't handle popping bytes if they don't fit the ret's immediate.
if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
return false;
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
CC == CallingConv::Tail)
return false;
// Let SDISel handle vararg functions.
if (F.isVarArg())
return false;
// Build a list of return value registers.
SmallVector<unsigned, 4> RetRegs;
if (Ret->getNumOperands() > 0) {
SmallVector<ISD::OutputArg, 4> Outs;
GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
const Value *RV = Ret->getOperand(0);
unsigned Reg = getRegForValue(RV);
if (Reg == 0)
return false;
// Only handle a single return value for now.
if (ValLocs.size() != 1)
return false;
CCValAssign &VA = ValLocs[0];
// Don't bother handling odd stuff for now.
if (VA.getLocInfo() != CCValAssign::Full)
return false;
// Only handle register returns for now.
if (!VA.isRegLoc())
return false;
// The calling-convention tables for x87 returns don't tell
// the whole story.
if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
unsigned SrcReg = Reg + VA.getValNo();
EVT SrcVT = TLI.getValueType(DL, RV->getType());
EVT DstVT = VA.getValVT();
// Special handling for extended integers.
if (SrcVT != DstVT) {
if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
return false;
if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
return false;
assert(DstVT == MVT::i32 && "X86 should always ext to i32");
if (SrcVT == MVT::i1) {
if (Outs[0].Flags.isSExt())
return false;
SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
}
unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
ISD::SIGN_EXTEND;
SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
SrcReg, /*TODO: Kill=*/false);
}
// Make the copy.
Register DstReg = VA.getLocReg();
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
// Avoid a cross-class copy. This is very unlikely.
if (!SrcRC->contains(DstReg))
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
// Add register to return instruction.
RetRegs.push_back(VA.getLocReg());
}
// The Swift calling convention does not require us to copy the sret
// argument into %rax/%eax for the return, and SRetReturnReg is not set
// for Swift.
// All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
if (F.hasStructRetAttr() && CC != CallingConv::Swift) {
unsigned Reg = X86MFInfo->getSRetReturnReg();
assert(Reg &&
"SRetReturnReg should have been set in LowerFormalArguments()!");
unsigned RetReg = Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
RetRegs.push_back(RetReg);
}
// Now emit the RET.
MachineInstrBuilder MIB;
if (X86MFInfo->getBytesToPopOnReturn()) {
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Subtarget->is64Bit() ? X86::RETIQ : X86::RETIL))
.addImm(X86MFInfo->getBytesToPopOnReturn());
} else {
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
}
for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
MIB.addReg(RetRegs[i], RegState::Implicit);
return true;
}
/// X86SelectLoad - Select and emit code to implement load instructions.
///
bool X86FastISel::X86SelectLoad(const Instruction *I) {
const LoadInst *LI = cast<LoadInst>(I);
// Atomic loads need special handling.
if (LI->isAtomic())
return false;
const Value *SV = I->getOperand(0);
if (TLI.supportSwiftError()) {
// Swifterror values can come from either a function parameter with
// swifterror attribute or an alloca with swifterror attribute.
if (const Argument *Arg = dyn_cast<Argument>(SV)) {
if (Arg->hasSwiftErrorAttr())
return false;
}
if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
if (Alloca->isSwiftError())
return false;
}
}
MVT VT;
if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
return false;
const Value *Ptr = LI->getPointerOperand();
X86AddressMode AM;
if (!X86SelectAddress(Ptr, AM))
return false;
unsigned Alignment = LI->getAlignment();
unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType());
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = ABIAlignment;
unsigned ResultReg = 0;
if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
Alignment))
return false;
updateValueMap(I, ResultReg);
return true;
}
static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
bool HasAVX512 = Subtarget->hasAVX512();
bool HasAVX = Subtarget->hasAVX();
bool X86ScalarSSEf32 = Subtarget->hasSSE1();
bool X86ScalarSSEf64 = Subtarget->hasSSE2();
switch (VT.getSimpleVT().SimpleTy) {
default: return 0;
case MVT::i8: return X86::CMP8rr;
case MVT::i16: return X86::CMP16rr;
case MVT::i32: return X86::CMP32rr;
case MVT::i64: return X86::CMP64rr;
case MVT::f32:
return X86ScalarSSEf32
? (HasAVX512 ? X86::VUCOMISSZrr
: HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr)
: 0;
case MVT::f64:
return X86ScalarSSEf64
? (HasAVX512 ? X86::VUCOMISDZrr
: HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr)
: 0;
}
}
/// If we have a comparison whose RHS is the constant RHSC, return an opcode
/// that can fold the immediate into the compare (e.g. CMP32ri); otherwise
/// return 0.
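/// For example (illustrative), "icmp eq i32 %x, 42" can use CMP32ri8 since
/// 42 fits in a sign-extended 8-bit immediate, whereas an i64 compare
/// against a constant wider than 32 bits cannot be folded.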
static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
int64_t Val = RHSC->getSExtValue();
switch (VT.getSimpleVT().SimpleTy) {
// Otherwise, we can't fold the immediate into this comparison.
default:
return 0;
case MVT::i8:
return X86::CMP8ri;
case MVT::i16:
if (isInt<8>(Val))
return X86::CMP16ri8;
return X86::CMP16ri;
case MVT::i32:
if (isInt<8>(Val))
return X86::CMP32ri8;
return X86::CMP32ri;
case MVT::i64:
if (isInt<8>(Val))
return X86::CMP64ri8;
// 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
// field.
if (isInt<32>(Val))
return X86::CMP64ri32;
return 0;
}
}
bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
const DebugLoc &CurDbgLoc) {
unsigned Op0Reg = getRegForValue(Op0);
if (Op0Reg == 0) return false;
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Op1))
Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
// We have two options: compare with register or immediate. If the RHS of
// the compare is an immediate that we can fold into this compare, use
// CMPri, otherwise use CMPrr.
if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
.addReg(Op0Reg)
.addImm(Op1C->getSExtValue());
return true;
}
}
unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
if (CompareOpc == 0) return false;
unsigned Op1Reg = getRegForValue(Op1);
if (Op1Reg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
.addReg(Op0Reg)
.addReg(Op1Reg);
return true;
}
bool X86FastISel::X86SelectCmp(const Instruction *I) {
const CmpInst *CI = cast<CmpInst>(I);
MVT VT;
if (!isTypeLegal(I->getOperand(0)->getType(), VT))
return false;
// Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
unsigned ResultReg = 0;
switch (Predicate) {
default: break;
case CmpInst::FCMP_FALSE: {
ResultReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
ResultReg);
ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
X86::sub_8bit);
if (!ResultReg)
return false;
break;
}
case CmpInst::FCMP_TRUE: {
ResultReg = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
ResultReg).addImm(1);
break;
}
}
if (ResultReg) {
updateValueMap(I, ResultReg);
return true;
}
const Value *LHS = CI->getOperand(0);
const Value *RHS = CI->getOperand(1);
// The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
// We don't have to materialize a zero constant for this case and can just use
// %x again on the RHS.
if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
const auto *RHSC = dyn_cast<ConstantFP>(RHS);
if (RHSC && RHSC->isNullValue())
RHS = LHS;
}
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
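// (Illustrative) After a UCOMISS/UCOMISD, oeq holds when ZF==1 && PF==0,
// so it lowers to SETE + SETNP + AND8rr; une holds when ZF==0 || PF==1,
// so it lowers to SETNE + SETP + OR8rr, matching the table below.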
static const uint16_t SETFOpcTable[2][3] = {
{ X86::COND_E, X86::COND_NP, X86::AND8rr },
{ X86::COND_NE, X86::COND_P, X86::OR8rr }
};
const uint16_t *SETFOpc = nullptr;
switch (Predicate) {
default: break;
case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
}
ResultReg = createResultReg(&X86::GR8RegClass);
if (SETFOpc) {
if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
return false;
unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
FlagReg1).addImm(SETFOpc[0]);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
FlagReg2).addImm(SETFOpc[1]);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
ResultReg).addReg(FlagReg1).addReg(FlagReg2);
updateValueMap(I, ResultReg);
return true;
}
X86::CondCode CC;
bool SwapArgs;
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
if (SwapArgs)
std::swap(LHS, RHS);
// Emit a compare of LHS/RHS.
if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
ResultReg).addImm(CC);
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectZExt(const Instruction *I) {
EVT DstVT = TLI.getValueType(DL, I->getType());
if (!TLI.isTypeLegal(DstVT))
return false;
unsigned ResultReg = getRegForValue(I->getOperand(0));
if (ResultReg == 0)
return false;
// Handle zero-extension from i1 to i8, which is common.
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT == MVT::i1) {
// Set the high bits to zero.
ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
if (ResultReg == 0)
return false;
}
if (DstVT == MVT::i64) {
// Handle extension to 64-bits via sub-register shenanigans.
unsigned MovInst;
switch (SrcVT.SimpleTy) {
case MVT::i8: MovInst = X86::MOVZX32rr8; break;
case MVT::i16: MovInst = X86::MOVZX32rr16; break;
case MVT::i32: MovInst = X86::MOV32rr; break;
default: llvm_unreachable("Unexpected zext to i64 source type");
}
unsigned Result32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
.addReg(ResultReg);
ResultReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
ResultReg)
.addImm(0).addReg(Result32).addImm(X86::sub_32bit);
} else if (DstVT == MVT::i16) {
// i8->i16 doesn't exist in the autogenerated isel table. Need to zero
// extend to 32-bits and then extract down to 16-bits.
unsigned Result32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8),
Result32).addReg(ResultReg);
ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true,
X86::sub_16bit);
} else if (DstVT != MVT::i8) {
ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
ResultReg, /*Kill=*/true);
if (ResultReg == 0)
return false;
}
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectSExt(const Instruction *I) {
EVT DstVT = TLI.getValueType(DL, I->getType());
if (!TLI.isTypeLegal(DstVT))
return false;
unsigned ResultReg = getRegForValue(I->getOperand(0));
if (ResultReg == 0)
return false;
// Handle sign-extension from i1 to i8.
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT == MVT::i1) {
// Set the high bits to zero.
unsigned ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg,
/*TODO: Kill=*/false);
if (ZExtReg == 0)
return false;
// Negate the result to make an 8-bit sign extended value.
ResultReg = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::NEG8r),
ResultReg).addReg(ZExtReg);
SrcVT = MVT::i8;
}
if (DstVT == MVT::i16) {
// i8->i16 doesn't exist in the autogenerated isel table. Need to sign
// extend to 32-bits and then extract down to 16-bits.
unsigned Result32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8),
Result32).addReg(ResultReg);
ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true,
X86::sub_16bit);
} else if (DstVT != MVT::i8) {
ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
ResultReg, /*Kill=*/true);
if (ResultReg == 0)
return false;
}
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectBranch(const Instruction *I) {
// Unconditional branches are selected by tablegen-generated code.
// Handle a conditional branch.
const BranchInst *BI = cast<BranchInst>(I);
MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
// Fold the common case of a conditional branch with a comparison
// in the same block (values defined on other blocks may not have
// initialized registers).
X86::CondCode CC;
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());
// Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
switch (Predicate) {
default: break;
case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true;
}
const Value *CmpLHS = CI->getOperand(0);
const Value *CmpRHS = CI->getOperand(1);
// The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
// 0.0.
// We don't have to materialize a zero constant for this case and can just
// use %x again on the RHS.
if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
if (CmpRHSC && CmpRHSC->isNullValue())
CmpRHS = CmpLHS;
}
// Try to take advantage of fallthrough opportunities.
if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
std::swap(TrueMBB, FalseMBB);
Predicate = CmpInst::getInversePredicate(Predicate);
}
// FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
// code check. Instead two branch instructions are required to check all
// the flags. First we change the predicate to a supported condition code,
// which will be the first branch. Later on we will emit the second
// branch.
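// (Illustrative) For "br (fcmp une %a, %b)" this emits JNE to the true
// block followed by JP to the true block, since an unordered UCOMISS
// result sets PF (and ZF), which JNE alone would miss.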
bool NeedExtraBranch = false;
switch (Predicate) {
default: break;
case CmpInst::FCMP_OEQ:
std::swap(TrueMBB, FalseMBB);
LLVM_FALLTHROUGH;
case CmpInst::FCMP_UNE:
NeedExtraBranch = true;
Predicate = CmpInst::FCMP_ONE;
break;
}
bool SwapArgs;
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
if (SwapArgs)
std::swap(CmpLHS, CmpRHS);
// Emit a compare of the LHS and RHS, setting the flags.
if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
.addMBB(TrueMBB).addImm(CC);
// X86 requires a second branch to handle UNE (and OEQ, which is mapped
// to UNE above).
if (NeedExtraBranch) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
.addMBB(TrueMBB).addImm(X86::COND_P);
}
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
} else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
// Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
// typically happen for _Bool and C++ bools.
MVT SourceVT;
if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
unsigned TestOpc = 0;
switch (SourceVT.SimpleTy) {
default: break;
case MVT::i8: TestOpc = X86::TEST8ri; break;
case MVT::i16: TestOpc = X86::TEST16ri; break;
case MVT::i32: TestOpc = X86::TEST32ri; break;
case MVT::i64: TestOpc = X86::TEST64ri32; break;
}
if (TestOpc) {
unsigned OpReg = getRegForValue(TI->getOperand(0));
if (OpReg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
.addReg(OpReg).addImm(1);
unsigned JmpCond = X86::COND_NE;
if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
std::swap(TrueMBB, FalseMBB);
JmpCond = X86::COND_E;
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
.addMBB(TrueMBB).addImm(JmpCond);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
}
} else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
// Fake request the condition, otherwise the intrinsic might be completely
// optimized away.
unsigned TmpReg = getRegForValue(BI->getCondition());
if (TmpReg == 0)
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
.addMBB(TrueMBB).addImm(CC);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
// Otherwise do a clumsy setcc and re-test it.
// Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
// in an explicit cast, so make sure to handle that correctly.
unsigned OpReg = getRegForValue(BI->getCondition());
if (OpReg == 0) return false;
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
unsigned KOpReg = OpReg;
OpReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), OpReg)
.addReg(KOpReg);
OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Kill=*/true,
X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(OpReg)
.addImm(1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
.addMBB(TrueMBB).addImm(X86::COND_NE);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
bool X86FastISel::X86SelectShift(const Instruction *I) {
unsigned CReg = 0, OpReg = 0;
const TargetRegisterClass *RC = nullptr;
if (I->getType()->isIntegerTy(8)) {
CReg = X86::CL;
RC = &X86::GR8RegClass;
switch (I->getOpcode()) {
case Instruction::LShr: OpReg = X86::SHR8rCL; break;
case Instruction::AShr: OpReg = X86::SAR8rCL; break;
case Instruction::Shl: OpReg = X86::SHL8rCL; break;
default: return false;
}
} else if (I->getType()->isIntegerTy(16)) {
CReg = X86::CX;
RC = &X86::GR16RegClass;
switch (I->getOpcode()) {
default: llvm_unreachable("Unexpected shift opcode");
case Instruction::LShr: OpReg = X86::SHR16rCL; break;
case Instruction::AShr: OpReg = X86::SAR16rCL; break;
case Instruction::Shl: OpReg = X86::SHL16rCL; break;
}
} else if (I->getType()->isIntegerTy(32)) {
CReg = X86::ECX;
RC = &X86::GR32RegClass;
switch (I->getOpcode()) {
default: llvm_unreachable("Unexpected shift opcode");
case Instruction::LShr: OpReg = X86::SHR32rCL; break;
case Instruction::AShr: OpReg = X86::SAR32rCL; break;
case Instruction::Shl: OpReg = X86::SHL32rCL; break;
}
} else if (I->getType()->isIntegerTy(64)) {
CReg = X86::RCX;
RC = &X86::GR64RegClass;
switch (I->getOpcode()) {
default: llvm_unreachable("Unexpected shift opcode");
case Instruction::LShr: OpReg = X86::SHR64rCL; break;
case Instruction::AShr: OpReg = X86::SAR64rCL; break;
case Instruction::Shl: OpReg = X86::SHL64rCL; break;
}
} else {
return false;
}
MVT VT;
if (!isTypeLegal(I->getType(), VT))
return false;
unsigned Op0Reg = getRegForValue(I->getOperand(0));
if (Op0Reg == 0) return false;
unsigned Op1Reg = getRegForValue(I->getOperand(1));
if (Op1Reg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
CReg).addReg(Op1Reg);
// The shift instruction uses X86::CL. If we defined a super-register
// of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
if (CReg != X86::CL)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::KILL), X86::CL)
.addReg(CReg, RegState::Kill);
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
.addReg(Op0Reg);
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectDivRem(const Instruction *I) {
const static unsigned NumTypes = 4; // i8, i16, i32, i64
const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem
const static bool S = true; // IsSigned
const static bool U = false; // !IsSigned
const static unsigned Copy = TargetOpcode::COPY;
// For the X86 DIV/IDIV instruction, in most cases the dividend
// (numerator) must be in a specific register pair highreg:lowreg,
// producing the quotient in lowreg and the remainder in highreg.
// For most data types, to set up the instruction, the dividend is
// copied into lowreg, and lowreg is sign-extended or zero-extended
// into highreg. The exception is i8, where the dividend is defined
// as a single register rather than a register pair, and we
// therefore directly sign-extend or zero-extend the dividend into
// lowreg, instead of copying, and ignore the highreg.
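// (Illustrative) For an i32 sdiv this expands to roughly:
//   COPY %eax, %dividend ; CDQ (sign-extend EAX into EDX)
//   IDIV32r %divisor     ; quotient in EAX, remainder in EDX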
const static struct DivRemEntry {
// The following portion depends only on the data type.
const TargetRegisterClass *RC;
unsigned LowInReg; // low part of the register pair
unsigned HighInReg; // high part of the register pair
// The following portion depends on both the data type and the operation.
struct DivRemResult {
unsigned OpDivRem; // The specific DIV/IDIV opcode to use.
unsigned OpSignExtend; // Opcode for sign-extending lowreg into
// highreg, or copying a zero into highreg.
unsigned OpCopy; // Opcode for copying dividend into lowreg, or
// zero/sign-extending into lowreg for i8.
unsigned DivRemResultReg; // Register containing the desired result.
bool IsOpSigned; // Whether to use signed or unsigned form.
} ResultTable[NumOps];
} OpTable[NumTypes] = {
{ &X86::GR8RegClass, X86::AX, 0, {
{ X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv
{ X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem
{ X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv
{ X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem
}
}, // i8
{ &X86::GR16RegClass, X86::AX, X86::DX, {
{ X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv
{ X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem
{ X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv
{ X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem
}
}, // i16
{ &X86::GR32RegClass, X86::EAX, X86::EDX, {
{ X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv
{ X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem
{ X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv
{ X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem
}
}, // i32
{ &X86::GR64RegClass, X86::RAX, X86::RDX, {
{ X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
{ X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
{ X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
{ X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
}
}, // i64
};
MVT VT;
if (!isTypeLegal(I->getType(), VT))
return false;
unsigned TypeIndex, OpIndex;
switch (VT.SimpleTy) {
default: return false;
case MVT::i8: TypeIndex = 0; break;
case MVT::i16: TypeIndex = 1; break;
case MVT::i32: TypeIndex = 2; break;
case MVT::i64: TypeIndex = 3;
if (!Subtarget->is64Bit())
return false;
break;
}
switch (I->getOpcode()) {
default: llvm_unreachable("Unexpected div/rem opcode");
case Instruction::SDiv: OpIndex = 0; break;
case Instruction::SRem: OpIndex = 1; break;
case Instruction::UDiv: OpIndex = 2; break;
case Instruction::URem: OpIndex = 3; break;
}
const DivRemEntry &TypeEntry = OpTable[TypeIndex];
const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
unsigned Op0Reg = getRegForValue(I->getOperand(0));
if (Op0Reg == 0)
return false;
unsigned Op1Reg = getRegForValue(I->getOperand(1));
if (Op1Reg == 0)
return false;
// Move op0 into low-order input register.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
// Zero-extend or sign-extend into high-order input register.
if (OpEntry.OpSignExtend) {
if (OpEntry.IsOpSigned)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpSignExtend));
else {
unsigned Zero32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::MOV32r0), Zero32);
// Copy the zero into the appropriate sub/super/identical physical
// register. Unfortunately the operations needed are not uniform enough
// to fit neatly into the table above.
if (VT == MVT::i16) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), TypeEntry.HighInReg)
.addReg(Zero32, 0, X86::sub_16bit);
} else if (VT == MVT::i32) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), TypeEntry.HighInReg)
.addReg(Zero32);
} else if (VT == MVT::i64) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
.addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
}
}
}
// Generate the DIV/IDIV instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
// For i8 remainder, we can't reference ah directly, as we'll end
// up with bogus copies like %r9b = COPY %ah. Reference ax
// instead to prevent ah references in a rex instruction.
//
// The current assumption of the fast register allocator is that isel
// won't generate explicit references to the GR8_NOREX registers. If
// the allocator and/or the backend get enhanced to be more robust in
// that regard, this can be, and should be, removed.
unsigned ResultReg = 0;
if ((I->getOpcode() == Instruction::SRem ||
I->getOpcode() == Instruction::URem) &&
OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), SourceSuperReg).addReg(X86::AX);
// Shift AX right by 8 bits instead of using AH.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
ResultSuperReg).addReg(SourceSuperReg).addImm(8);
// Now reference the 8-bit subreg of the result.
ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
/*Kill=*/true, X86::sub_8bit);
}
// Copy the result out of the physreg if we haven't already.
if (!ResultReg) {
ResultReg = createResultReg(TypeEntry.RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
.addReg(OpEntry.DivRemResultReg);
}
updateValueMap(I, ResultReg);
return true;
}
/// Emit a conditional move instruction (if they are supported) to lower
/// the select.
bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// Check if the subtarget supports these instructions.
if (!Subtarget->hasCMov())
return false;
// FIXME: Add support for i8.
if (RetVT < MVT::i16 || RetVT > MVT::i64)
return false;
const Value *Cond = I->getOperand(0);
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
bool NeedTest = true;
X86::CondCode CC = X86::COND_NE;
// Optimize conditions coming from a compare if both instructions are in the
// same basic block (values defined in other basic blocks may not have
// initialized registers).
const auto *CI = dyn_cast<CmpInst>(Cond);
if (CI && (CI->getParent() == I->getParent())) {
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
static const uint16_t SETFOpcTable[2][3] = {
{ X86::COND_NP, X86::COND_E, X86::TEST8rr },
{ X86::COND_P, X86::COND_NE, X86::OR8rr }
};
const uint16_t *SETFOpc = nullptr;
switch (Predicate) {
default: break;
case CmpInst::FCMP_OEQ:
SETFOpc = &SETFOpcTable[0][0];
Predicate = CmpInst::ICMP_NE;
break;
case CmpInst::FCMP_UNE:
SETFOpc = &SETFOpcTable[1][0];
Predicate = CmpInst::ICMP_NE;
break;
}
bool NeedSwap;
std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
const Value *CmpLHS = CI->getOperand(0);
const Value *CmpRHS = CI->getOperand(1);
if (NeedSwap)
std::swap(CmpLHS, CmpRHS);
EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
// Emit a compare of the LHS and RHS, setting the flags.
if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
return false;
if (SETFOpc) {
unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
FlagReg1).addImm(SETFOpc[0]);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
FlagReg2).addImm(SETFOpc[1]);
auto const &II = TII.get(SETFOpc[2]);
if (II.getNumDefs()) {
unsigned TmpReg = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
.addReg(FlagReg2).addReg(FlagReg1);
} else {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(FlagReg2).addReg(FlagReg1);
}
}
NeedTest = false;
} else if (foldX86XALUIntrinsic(CC, I, Cond)) {
// Fake request the condition, otherwise the intrinsic might be completely
// optimized away.
unsigned TmpReg = getRegForValue(Cond);
if (TmpReg == 0)
return false;
NeedTest = false;
}
if (NeedTest) {
// Selects operate on i1; however, CondReg is 8 bits wide and may contain
// garbage. Indeed, only the least significant bit is supposed to be
// accurate. If we read more than the lsb, we may see non-zero values
// even when the lsb is zero. Therefore, we have to truncate CondReg to i1
// for the select. This is achieved by performing a TEST against 1.
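// (Illustrative) If CondReg held 0x02, a plain zero-compare would see a
// non-zero value even though the i1 payload (bit 0) is 0; TEST8ri with an
// immediate of 1 examines bit 0 only.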
unsigned CondReg = getRegForValue(Cond);
if (CondReg == 0)
return false;
bool CondIsKill = hasTrivialKill(Cond);
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
unsigned KCondReg = CondReg;
CondReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
.addReg(KCondReg, getKillRegState(CondIsKill));
CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(CondReg, getKillRegState(CondIsKill))
.addImm(1);
}
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
unsigned RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
unsigned LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
if (!LHSReg || !RHSReg)
return false;
const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8);
unsigned ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill,
LHSReg, LHSIsKill, CC);
updateValueMap(I, ResultReg);
return true;
}
/// Emit SSE or AVX instructions to lower the select.
///
/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
/// SSE instructions are available. If AVX is available, try to use a VBLENDV.
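/// (Illustrative) The scalar SSE sequence computes
///   mask = cmpss(cc, lhs, rhs); res = (mask & tval) | (andnot(mask) & fval)
/// where cmpss produces all-ones or all-zeros in the low element.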
bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
// Optimize conditions coming from a compare if both instructions are in the
// same basic block (values defined in other basic blocks may not have
// initialized registers).
const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
if (!CI || (CI->getParent() != I->getParent()))
return false;
if (I->getType() != CI->getOperand(0)->getType() ||
!((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
(Subtarget->hasSSE2() && RetVT == MVT::f64)))
return false;
const Value *CmpLHS = CI->getOperand(0);
const Value *CmpRHS = CI->getOperand(1);
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
// The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
// We don't have to materialize a zero constant for this case and can just use
// %x again on the RHS.
if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
if (CmpRHSC && CmpRHSC->isNullValue())
CmpRHS = CmpLHS;
}
unsigned CC;
bool NeedSwap;
std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
if (CC > 7 && !Subtarget->hasAVX())
return false;
if (NeedSwap)
std::swap(CmpLHS, CmpRHS);
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
unsigned LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
unsigned RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
unsigned CmpLHSReg = getRegForValue(CmpLHS);
bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
unsigned CmpRHSReg = getRegForValue(CmpRHS);
bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
return false;
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
unsigned ResultReg;
if (Subtarget->hasAVX512()) {
// If we have AVX512 we can use a mask compare and masked movss/sd.
const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
const TargetRegisterClass *VK1 = &X86::VK1RegClass;
unsigned CmpOpcode =
(RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
// Need an IMPLICIT_DEF for the input that is used to generate the upper
// bits of the result register since it's not based on any of the inputs.
unsigned ImplicitDefReg = createResultReg(VR128X);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
// Use RHSReg as the passthru of the masked movss/sd operation and put
// LHS in the input. The mask input comes from the compare.
unsigned MovOpcode =
(RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill,
CmpReg, true, ImplicitDefReg, true,
LHSReg, LHSIsKill);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);
} else if (Subtarget->hasAVX()) {
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
// If we have AVX, create 1 blendv instead of 3 logic instructions.
// Blendv was introduced with SSE 4.1, but the 2 register form implicitly
// uses XMM0 as the selection register. That may need just as many
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
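// (Illustrative) VBLENDV selects, per lane, the "true" operand where the
// mask's sign bit is set; the VCMPSS result is all-ones when the
// predicate holds, so the blend picks LHS there and RHS otherwise.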
unsigned CmpOpcode =
(RetVT == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
unsigned BlendOpcode =
(RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
LHSReg, LHSIsKill, CmpReg, true);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
} else {
// Choose the SSE instruction sequence based on data type (float or double).
static const uint16_t OpcTable[2][4] = {
{ X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr },
{ X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr }
};
const uint16_t *Opc = nullptr;
switch (RetVT.SimpleTy) {
default: return false;
case MVT::f32: Opc = &OpcTable[0][0]; break;
case MVT::f64: Opc = &OpcTable[1][0]; break;
}
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
LHSReg, LHSIsKill);
unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
RHSReg, RHSIsKill);
unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
AndReg, /*IsKill=*/true);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
}
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
// These are pseudo CMOV instructions and will later be expanded into
// control flow.
unsigned Opc;
switch (RetVT.SimpleTy) {
default: return false;
case MVT::i8: Opc = X86::CMOV_GR8; break;
case MVT::i16: Opc = X86::CMOV_GR16; break;
case MVT::i32: Opc = X86::CMOV_GR32; break;
case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X
: X86::CMOV_FR32; break;
case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X
: X86::CMOV_FR64; break;
}
const Value *Cond = I->getOperand(0);
X86::CondCode CC = X86::COND_NE;
// Optimize conditions coming from a compare if both instructions are in the
// same basic block (values defined in other basic blocks may not have
// initialized registers).
const auto *CI = dyn_cast<CmpInst>(Cond);
if (CI && (CI->getParent() == I->getParent())) {
bool NeedSwap;
std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
if (CC > X86::LAST_VALID_COND)
return false;
const Value *CmpLHS = CI->getOperand(0);
const Value *CmpRHS = CI->getOperand(1);
if (NeedSwap)
std::swap(CmpLHS, CmpRHS);
EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
return false;
} else {
unsigned CondReg = getRegForValue(Cond);
if (CondReg == 0)
return false;
bool CondIsKill = hasTrivialKill(Cond);
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
unsigned KCondReg = CondReg;
CondReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
.addReg(KCondReg, getKillRegState(CondIsKill));
CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(CondReg, getKillRegState(CondIsKill))
.addImm(1);
}
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
unsigned LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
unsigned RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
if (!LHSReg || !RHSReg)
return false;
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
unsigned ResultReg =
fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectSelect(const Instruction *I) {
MVT RetVT;
if (!isTypeLegal(I->getType(), RetVT))
return false;
// Check if we can fold the select.
if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
const Value *Opnd = nullptr;
switch (Predicate) {
default: break;
case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break;
}
// No need for a select anymore - this is an unconditional move.
if (Opnd) {
unsigned OpReg = getRegForValue(Opnd);
if (OpReg == 0)
return false;
bool OpIsKill = hasTrivialKill(Opnd);
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(OpReg, getKillRegState(OpIsKill));
updateValueMap(I, ResultReg);
return true;
}
}
// First try to use real conditional move instructions.
if (X86FastEmitCMoveSelect(RetVT, I))
return true;
// Try to use a sequence of SSE instructions to simulate a conditional move.
if (X86FastEmitSSESelect(RetVT, I))
return true;
// Fall-back to pseudo conditional move instructions, which will be later
// converted to control-flow.
if (X86FastEmitPseudoSelect(RetVT, I))
return true;
return false;
}
// Common code for X86SelectSIToFP and X86SelectUIToFP.
bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
// The target-independent selection algorithm in FastISel already knows how
// to select a SINT_TO_FP if the target is SSE but not AVX.
// Early exit if the subtarget doesn't have AVX.
// Unsigned conversion requires AVX-512.
bool HasAVX512 = Subtarget->hasAVX512();
if (!Subtarget->hasAVX() || (!IsSigned && !HasAVX512))
return false;
// TODO: We could sign extend narrower types.
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT != MVT::i32 && SrcVT != MVT::i64)
return false;
// Select integer to float/double conversion.
unsigned OpReg = getRegForValue(I->getOperand(0));
if (OpReg == 0)
return false;
unsigned Opcode;
static const uint16_t SCvtOpc[2][2][2] = {
{ { X86::VCVTSI2SSrr, X86::VCVTSI642SSrr },
{ X86::VCVTSI2SDrr, X86::VCVTSI642SDrr } },
{ { X86::VCVTSI2SSZrr, X86::VCVTSI642SSZrr },
{ X86::VCVTSI2SDZrr, X86::VCVTSI642SDZrr } },
};
static const uint16_t UCvtOpc[2][2] = {
{ X86::VCVTUSI2SSZrr, X86::VCVTUSI642SSZrr },
{ X86::VCVTUSI2SDZrr, X86::VCVTUSI642SDZrr },
};
bool Is64Bit = SrcVT == MVT::i64;
if (I->getType()->isDoubleTy()) {
// s/uitofp int -> double
Opcode = IsSigned ? SCvtOpc[HasAVX512][1][Is64Bit] : UCvtOpc[1][Is64Bit];
} else if (I->getType()->isFloatTy()) {
// s/uitofp int -> float
Opcode = IsSigned ? SCvtOpc[HasAVX512][0][Is64Bit] : UCvtOpc[0][Is64Bit];
} else
return false;
MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT();
const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT);
unsigned ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
unsigned ResultReg =
fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false);
updateValueMap(I, ResultReg);
return true;
}
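// Worked example of the table indexing above (indices are
// [HasAVX512][IsDouble][Is64Bit]): 'sitofp i64 %x to double' on an
// AVX512 subtarget picks SCvtOpc[1][1][1] == X86::VCVTSI642SDZrr, while
// 'uitofp i32 %x to float' picks UCvtOpc[0][0] == X86::VCVTUSI2SSZrr
// (unsigned conversions are only reached with AVX512 available).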
bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
return X86SelectIntToFP(I, /*IsSigned*/true);
}
bool X86FastISel::X86SelectUIToFP(const Instruction *I) {
return X86SelectIntToFP(I, /*IsSigned*/false);
}
// Helper method used by X86SelectFPExt and X86SelectFPTrunc.
bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
unsigned TargetOpc,
const TargetRegisterClass *RC) {
assert((I->getOpcode() == Instruction::FPExt ||
I->getOpcode() == Instruction::FPTrunc) &&
"Instruction must be an FPExt or FPTrunc!");
bool HasAVX = Subtarget->hasAVX();
unsigned OpReg = getRegForValue(I->getOperand(0));
if (OpReg == 0)
return false;
unsigned ImplicitDefReg;
if (HasAVX) {
ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
}
unsigned ResultReg = createResultReg(RC);
MachineInstrBuilder MIB;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
ResultReg);
if (HasAVX)
MIB.addReg(ImplicitDefReg);
MIB.addReg(OpReg);
updateValueMap(I, ResultReg);
return true;
}
bool X86FastISel::X86SelectFPExt(const Instruction *I) {
if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
I->getOperand(0)->getType()->isFloatTy()) {
bool HasAVX512 = Subtarget->hasAVX512();
// fpext from float to double.
unsigned Opc =
HasAVX512 ? X86::VCVTSS2SDZrr
: Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64));
}
return false;
}
bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
I->getOperand(0)->getType()->isDoubleTy()) {
bool HasAVX512 = Subtarget->hasAVX512();
// fptrunc from double to float.
unsigned Opc =
HasAVX512 ? X86::VCVTSD2SSZrr
: Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32));
}
return false;
}
bool X86FastISel::X86SelectTrunc(const Instruction *I) {
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
EVT DstVT = TLI.getValueType(DL, I->getType());
// This code only handles truncation to byte.
if (DstVT != MVT::i8 && DstVT != MVT::i1)
return false;
if (!TLI.isTypeLegal(SrcVT))
return false;
unsigned InputReg = getRegForValue(I->getOperand(0));
if (!InputReg)
// Unhandled operand. Halt "fast" selection and bail.
return false;
if (SrcVT == MVT::i8) {
// Truncate from i8 to i1; no code needed.
updateValueMap(I, InputReg);
return true;
}
// Issue an extract_subreg.
unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
InputReg, false,
X86::sub_8bit);
if (!ResultReg)
return false;
updateValueMap(I, ResultReg);
return true;
}
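// For example, 'trunc i32 %x to i8' becomes an EXTRACT_SUBREG of the
// low byte (X86::sub_8bit) of the 32-bit source register, while
// 'trunc i8 %x to i1' needs no instruction at all - the i8 register is
// simply reused for the i1 value.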
bool X86FastISel::IsMemcpySmall(uint64_t Len) {
return Len <= (Subtarget->is64Bit() ? 32 : 16);
}
bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
X86AddressMode SrcAM, uint64_t Len) {
// Make sure we don't bloat code by inlining very large memcpy's.
if (!IsMemcpySmall(Len))
return false;
bool i64Legal = Subtarget->is64Bit();
// We don't care about alignment here since we just emit integer accesses.
while (Len) {
MVT VT;
if (Len >= 8 && i64Legal)
VT = MVT::i64;
else if (Len >= 4)
VT = MVT::i32;
else if (Len >= 2)
VT = MVT::i16;
else
VT = MVT::i8;
unsigned Reg;
bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
assert(RV && "Failed to emit load or store??");
unsigned Size = VT.getSizeInBits()/8;
Len -= Size;
DestAM.Disp += Size;
SrcAM.Disp += Size;
}
return true;
}
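// Example decomposition: a 13-byte memcpy on x86-64 becomes three
// load/store pairs - i64 (8 bytes), then i32 (4 bytes), then i8
// (1 byte). Anything larger than 32 bytes (16 on 32-bit targets) is
// rejected by IsMemcpySmall and left for a library call.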
bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// FIXME: Handle more intrinsics.
switch (II->getIntrinsicID()) {
default: return false;
case Intrinsic::convert_from_fp16:
case Intrinsic::convert_to_fp16: {
if (Subtarget->useSoftFloat() || !Subtarget->hasF16C())
return false;
const Value *Op = II->getArgOperand(0);
unsigned InputReg = getRegForValue(Op);
if (InputReg == 0)
return false;
// F16C only allows converting from float to half and from half to float.
bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
if (IsFloatToHalf) {
if (!Op->getType()->isFloatTy())
return false;
} else {
if (!II->getType()->isFloatTy())
return false;
}
unsigned ResultReg = 0;
const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
if (IsFloatToHalf) {
// 'InputReg' is implicitly promoted from register class FR32 to
// register class VR128 by method 'constrainOperandRegClass' which is
// directly called by 'fastEmitInst_ri'.
// Instruction VCVTPS2PHrr takes an extra immediate operand which is
// used to provide rounding control: use MXCSR.RC, encoded as 0b100.
// It's consistent with the other FP instructions, which are usually
// controlled by MXCSR.
InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4);
// Move the lower 32-bits of ResultReg to another register of class GR32.
ResultReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::VMOVPDI2DIrr), ResultReg)
.addReg(InputReg, RegState::Kill);
// The result value is in the lower 16-bits of ResultReg.
unsigned RegIdx = X86::sub_16bit;
ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
} else {
assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
// Explicitly sign-extend the input to 32-bit.
InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg,
/*Kill=*/false);
// The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
InputReg, /*Kill=*/true);
InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true);
// The result value is in the lower 32-bits of ResultReg.
// Emit an explicit copy from register class VR128 to register class FR32.
ResultReg = createResultReg(&X86::FR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(InputReg, RegState::Kill);
}
updateValueMap(II, ResultReg);
return true;
}
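// Roughly, the float-to-half path above emits a sequence such as
//   vcvtps2ph $4, %xmm0, %xmm0   ; round using MXCSR.RC
//   vmovd     %xmm0, %eax        ; low 32 bits into a GR32
// with the i16 result taken from the sub_16bit subregister; the
// half-to-float path is the reverse: widen the i16, move it into an
// XMM register, then convert with vcvtph2ps.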
case Intrinsic::frameaddress: {
MachineFunction *MF = FuncInfo.MF;
if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
return false;
Type *RetTy = II->getCalledFunction()->getReturnType();
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
unsigned Opc;
const TargetRegisterClass *RC = nullptr;
switch (VT.SimpleTy) {
default: llvm_unreachable("Invalid result type for frameaddress.");
case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
}
// This needs to be set before we call getPtrSizedFrameRegister, otherwise
// we get the wrong frame register.
MachineFrameInfo &MFI = MF->getFrameInfo();
MFI.setFrameAddressIsTaken(true);
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
// Always make a copy of the frame register to a vreg first, so that we
// never directly reference the frame register (the TwoAddressInstruction-
// Pass doesn't like that).
unsigned SrcReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
// Now recursively load from the frame address.
// movq (%rbp), %rax
// movq (%rax), %rax
// movq (%rax), %rax
// ...
unsigned DestReg;
unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
while (Depth--) {
DestReg = createResultReg(RC);
addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), DestReg), SrcReg);
SrcReg = DestReg;
}
updateValueMap(II, SrcReg);
return true;
}
case Intrinsic::memcpy: {
const MemCpyInst *MCI = cast<MemCpyInst>(II);
// Don't handle volatile or variable length memcpys.
if (MCI->isVolatile())
return false;
if (isa<ConstantInt>(MCI->getLength())) {
// Small memcpy's are common enough that we want to do them
// without a call if possible.
uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
if (IsMemcpySmall(Len)) {
X86AddressMode DestAM, SrcAM;
if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
!X86SelectAddress(MCI->getRawSource(), SrcAM))
return false;
TryEmitSmallMemcpy(DestAM, SrcAM, Len);
return true;
}
}
unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
return false;
if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
return false;
return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 1);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
if (MSI->isVolatile())
return false;
unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
return false;
if (MSI->getDestAddressSpace() > 255)
return false;
return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
}
case Intrinsic::stackprotector: {
// Emit code to store the stack guard onto the stack.
EVT PtrTy = TLI.getPointerTy(DL);
const Value *Op1 = II->getArgOperand(0); // The guard's value.
const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
// Grab the frame index.
X86AddressMode AM;
if (!X86SelectAddress(Slot, AM)) return false;
if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
return true;
}
case Intrinsic::dbg_declare: {
const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
X86AddressMode AM;
assert(DI->getAddress() && "Null address should be checked earlier!");
if (!X86SelectAddress(DI->getAddress(), AM))
return false;
const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
// FIXME may need to add RegState::Debug to any registers produced,
// although ESP/EBP should be the only ones at the moment.
assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
"Expected inlined-at fields to agree");
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
.addImm(0)
.addMetadata(DI->getVariable())
.addMetadata(DI->getExpression());
return true;
}
case Intrinsic::trap: {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
return true;
}
case Intrinsic::sqrt: {
if (!Subtarget->hasSSE1())
return false;
Type *RetTy = II->getCalledFunction()->getReturnType();
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
// Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
// is not generated by FastISel yet.
// FIXME: Update this code once tablegen can handle it.
static const uint16_t SqrtOpc[3][2] = {
{ X86::SQRTSSr, X86::SQRTSDr },
{ X86::VSQRTSSr, X86::VSQRTSDr },
{ X86::VSQRTSSZr, X86::VSQRTSDZr },
};
unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
Subtarget->hasAVX() ? 1 :
0;
unsigned Opc;
switch (VT.SimpleTy) {
default: return false;
case MVT::f32: Opc = SqrtOpc[AVXLevel][0]; break;
case MVT::f64: Opc = SqrtOpc[AVXLevel][1]; break;
}
const Value *SrcVal = II->getArgOperand(0);
unsigned SrcReg = getRegForValue(SrcVal);
if (SrcReg == 0)
return false;
const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
unsigned ImplicitDefReg = 0;
if (AVXLevel > 0) {
ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
}
unsigned ResultReg = createResultReg(RC);
MachineInstrBuilder MIB;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
ResultReg);
if (ImplicitDefReg)
MIB.addReg(ImplicitDefReg);
MIB.addReg(SrcReg);
updateValueMap(II, ResultReg);
return true;
}
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow: {
// This implements the basic lowering of the xalu with overflow intrinsics
// into add/sub/mul followed by either seto or setb.
const Function *Callee = II->getCalledFunction();
auto *Ty = cast<StructType>(Callee->getReturnType());
Type *RetTy = Ty->getTypeAtIndex(0U);
assert(Ty->getTypeAtIndex(1)->isIntegerTy() &&
Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 &&
"Overflow value expected to be an i1");
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
if (VT < MVT::i8 || VT > MVT::i64)
return false;
const Value *LHS = II->getArgOperand(0);
const Value *RHS = II->getArgOperand(1);
// Canonicalize immediate to the RHS.
if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
isCommutativeIntrinsic(II))
std::swap(LHS, RHS);
unsigned BaseOpc, CondCode;
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::sadd_with_overflow:
BaseOpc = ISD::ADD; CondCode = X86::COND_O; break;
case Intrinsic::uadd_with_overflow:
BaseOpc = ISD::ADD; CondCode = X86::COND_B; break;
case Intrinsic::ssub_with_overflow:
BaseOpc = ISD::SUB; CondCode = X86::COND_O; break;
case Intrinsic::usub_with_overflow:
BaseOpc = ISD::SUB; CondCode = X86::COND_B; break;
case Intrinsic::smul_with_overflow:
BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break;
case Intrinsic::umul_with_overflow:
BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break;
}
unsigned LHSReg = getRegForValue(LHS);
if (LHSReg == 0)
return false;
bool LHSIsKill = hasTrivialKill(LHS);
unsigned ResultReg = 0;
// Check if we have an immediate version.
if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
static const uint16_t Opc[2][4] = {
{ X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
{ X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
};
if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) &&
CondCode == X86::COND_O) {
// We can use INC/DEC.
ResultReg = createResultReg(TLI.getRegClassFor(VT));
bool IsDec = BaseOpc == ISD::SUB;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
.addReg(LHSReg, getKillRegState(LHSIsKill));
} else
ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
CI->getZExtValue());
}
unsigned RHSReg;
bool RHSIsKill;
if (!ResultReg) {
RHSReg = getRegForValue(RHS);
if (RHSReg == 0)
return false;
RHSIsKill = hasTrivialKill(RHS);
ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
RHSIsKill);
}
// FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
// it manually.
if (BaseOpc == X86ISD::UMUL && !ResultReg) {
static const uint16_t MULOpc[] =
{ X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
// First copy the first operand into RAX, which is an implicit input to
// the X86::MUL*r instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
.addReg(LHSReg, getKillRegState(LHSIsKill));
ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
} else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
static const uint16_t MULOpc[] =
{ X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
if (VT == MVT::i8) {
// Copy the first operand into AL, which is an implicit input to the
// X86::IMUL8r instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), X86::AL)
.addReg(LHSReg, getKillRegState(LHSIsKill));
ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
RHSIsKill);
} else
ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
RHSReg, RHSIsKill);
}
if (!ResultReg)
return false;
// Assign to a GPR since the overflow return value is lowered to a SETcc.
unsigned ResultReg2 = createResultReg(&X86::GR8RegClass);
assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
ResultReg2).addImm(CondCode);
updateValueMap(II, ResultReg, 2);
return true;
}
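// For instance, '%r = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a,
// i32 %b)' lowers to an ADD32rr followed by SETCCr with X86::COND_O
// (seto) producing the i1 overflow bit; the unsigned add/sub variants
// test the carry flag instead (setb, X86::COND_B).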
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
case Intrinsic::x86_sse2_cvttsd2si:
case Intrinsic::x86_sse2_cvttsd2si64: {
bool IsInputDouble;
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Unexpected intrinsic.");
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
if (!Subtarget->hasSSE1())
return false;
IsInputDouble = false;
break;
case Intrinsic::x86_sse2_cvttsd2si:
case Intrinsic::x86_sse2_cvttsd2si64:
if (!Subtarget->hasSSE2())
return false;
IsInputDouble = true;
break;
}
Type *RetTy = II->getCalledFunction()->getReturnType();
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
static const uint16_t CvtOpc[3][2][2] = {
{ { X86::CVTTSS2SIrr, X86::CVTTSS2SI64rr },
{ X86::CVTTSD2SIrr, X86::CVTTSD2SI64rr } },
{ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SI64rr },
{ X86::VCVTTSD2SIrr, X86::VCVTTSD2SI64rr } },
{ { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SI64Zrr },
{ X86::VCVTTSD2SIZrr, X86::VCVTTSD2SI64Zrr } },
};
unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
Subtarget->hasAVX() ? 1 :
0;
unsigned Opc;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected result type.");
case MVT::i32: Opc = CvtOpc[AVXLevel][IsInputDouble][0]; break;
case MVT::i64: Opc = CvtOpc[AVXLevel][IsInputDouble][1]; break;
}
// Check if we can fold insertelement instructions into the convert.
const Value *Op = II->getArgOperand(0);
while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
const Value *Index = IE->getOperand(2);
if (!isa<ConstantInt>(Index))
break;
unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
if (Idx == 0) {
Op = IE->getOperand(1);
break;
}
Op = IE->getOperand(0);
}
unsigned Reg = getRegForValue(Op);
if (Reg == 0)
return false;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(Reg);
updateValueMap(II, ResultReg);
return true;
}
}
}
bool X86FastISel::fastLowerArguments() {
if (!FuncInfo.CanLowerReturn)
return false;
const Function *F = FuncInfo.Fn;
if (F->isVarArg())
return false;
CallingConv::ID CC = F->getCallingConv();
if (CC != CallingConv::C)
return false;
if (Subtarget->isCallingConvWin64(CC))
return false;
if (!Subtarget->is64Bit())
return false;
if (Subtarget->useSoftFloat())
return false;
// Only handle simple cases, i.e. up to 6 i32/i64 scalar arguments.
unsigned GPRCnt = 0;
unsigned FPRCnt = 0;
for (auto const &Arg : F->args()) {
if (Arg.hasAttribute(Attribute::ByVal) ||
Arg.hasAttribute(Attribute::InReg) ||
Arg.hasAttribute(Attribute::StructRet) ||
Arg.hasAttribute(Attribute::SwiftSelf) ||
Arg.hasAttribute(Attribute::SwiftError) ||
Arg.hasAttribute(Attribute::Nest))
return false;
Type *ArgTy = Arg.getType();
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
return false;
EVT ArgVT = TLI.getValueType(DL, ArgTy);
if (!ArgVT.isSimple()) return false;
switch (ArgVT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i32:
case MVT::i64:
++GPRCnt;
break;
case MVT::f32:
case MVT::f64:
if (!Subtarget->hasSSE1())
return false;
++FPRCnt;
break;
}
if (GPRCnt > 6)
return false;
if (FPRCnt > 8)
return false;
}
static const MCPhysReg GPR32ArgRegs[] = {
X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
};
static const MCPhysReg GPR64ArgRegs[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
};
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned GPRIdx = 0;
unsigned FPRIdx = 0;
for (auto const &Arg : F->args()) {
MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
unsigned SrcReg;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type.");
case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
case MVT::f32: LLVM_FALLTHROUGH;
case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
}
unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(DstReg, getKillRegState(true));
updateValueMap(&Arg, ResultReg);
}
return true;
}
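// Example of the assignment above: for 'define i64 @f(i32 %a, double %b,
// i64 %c)', %a lands in EDI (GPR32ArgRegs[0]), %b in XMM0
// (XMMArgRegs[0]) and %c in RSI (GPR64ArgRegs[1]), since the GPR and
// XMM counters advance independently.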
static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
CallingConv::ID CC,
ImmutableCallSite *CS) {
if (Subtarget->is64Bit())
return 0;
if (Subtarget->getTargetTriple().isOSMSVCRT())
return 0;
if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::HiPE || CC == CallingConv::Tail)
return 0;
if (CS)
if (CS->arg_empty() || !CS->paramHasAttr(0, Attribute::StructRet) ||
CS->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
return 0;
return 4;
}
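// Put differently: only on 32-bit, non-MSVCRT, non-MCU targets with a
// conventional calling convention does a callee taking a plain
// (non-inreg) sret argument pop the 4-byte hidden pointer itself,
// i.e. return with 'ret $4'; every other case pops nothing.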
bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
auto &OutVals = CLI.OutVals;
auto &OutFlags = CLI.OutFlags;
auto &OutRegs = CLI.OutRegs;
auto &Ins = CLI.Ins;
auto &InRegs = CLI.InRegs;
CallingConv::ID CC = CLI.CallConv;
bool &IsTailCall = CLI.IsTailCall;
bool IsVarArg = CLI.IsVarArg;
const Value *Callee = CLI.Callee;
MCSymbol *Symbol = CLI.Symbol;
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CC);
const CallInst *CI =
CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr;
// Call / invoke instructions with NoCfCheck attribute require special
// handling.
const auto *II =
CLI.CS ? dyn_cast<InvokeInst>(CLI.CS->getInstruction()) : nullptr;
if ((CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()))
return false;
// Functions with no_caller_saved_registers need special handling.
if ((CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
return false;
- // Functions using retpoline for indirect calls need to use SDISel.
- if (Subtarget->useRetpolineIndirectCalls())
+ // Functions using thunks for indirect calls need to use SDISel.
+ if (Subtarget->useIndirectThunkCalls())
return false;
// Handle only C, fastcc, and webkit_js calling conventions for now.
switch (CC) {
default: return false;
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Tail:
case CallingConv::WebKit_JS:
case CallingConv::Swift:
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_ThisCall:
case CallingConv::Win64:
case CallingConv::X86_64_SysV:
case CallingConv::CFGuard_Check:
break;
}
// Allow SelectionDAG isel to handle tail calls.
if (IsTailCall)
return false;
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. FastISel doesn't know how to do that.
if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
CC == CallingConv::Tail)
return false;
// Don't know how to handle Win64 varargs yet. Nothing special needed for
// x86-32. Special handling for x86-64 is implemented.
if (IsVarArg && IsWin64)
return false;
// Don't know about inalloca yet.
if (CLI.CS && CLI.CS->hasInAllocaArgument())
return false;
for (auto Flag : CLI.OutFlags)
if (Flag.isSwiftError())
return false;
SmallVector<MVT, 16> OutVTs;
SmallVector<unsigned, 16> ArgRegs;
// If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
// instruction. This is safe because it is common to all FastISel supported
// calling conventions on x86.
for (int i = 0, e = OutVals.size(); i != e; ++i) {
Value *&Val = OutVals[i];
ISD::ArgFlagsTy Flags = OutFlags[i];
if (auto *CI = dyn_cast<ConstantInt>(Val)) {
if (CI->getBitWidth() < 32) {
if (Flags.isSExt())
Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
else
Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
}
}
// Passing bools around ends up doing a trunc to i1 and passing it.
// Codegen this as an argument + "and 1".
MVT VT;
auto *TI = dyn_cast<TruncInst>(Val);
unsigned ResultReg;
if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
(TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
TI->hasOneUse()) {
Value *PrevVal = TI->getOperand(0);
ResultReg = getRegForValue(PrevVal);
if (!ResultReg)
return false;
if (!isTypeLegal(PrevVal->getType(), VT))
return false;
ResultReg =
fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
} else {
if (!isTypeLegal(Val->getType(), VT))
return false;
ResultReg = getRegForValue(Val);
}
if (!ResultReg)
return false;
ArgRegs.push_back(ResultReg);
OutVTs.push_back(VT);
}
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
// Allocate shadow area for Win64
if (IsWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
.addImm(NumBytes).addImm(0).addImm(0);
// Walk the register/memloc assignments, inserting copies/loads.
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign const &VA = ArgLocs[i];
const Value *ArgVal = OutVals[VA.getValNo()];
MVT ArgVT = OutVTs[VA.getValNo()];
if (ArgVT == MVT::x86mmx)
return false;
unsigned ArgReg = ArgRegs[VA.getValNo()];
// Promote the value if needed.
switch (VA.getLocInfo()) {
case CCValAssign::Full: break;
case CCValAssign::SExt: {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
"Unexpected extend");
if (ArgVT == MVT::i1)
return false;
bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
ArgVT = VA.getLocVT();
break;
}
case CCValAssign::ZExt: {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
"Unexpected extend");
// Handle zero-extension from i1 to i8, which is common.
if (ArgVT == MVT::i1) {
// Set the high bits to zero.
ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
ArgVT = MVT::i8;
if (ArgReg == 0)
return false;
}
bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
ArgVT = VA.getLocVT();
break;
}
case CCValAssign::AExt: {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
"Unexpected extend");
bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
if (!Emitted)
Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
if (!Emitted)
Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
ArgVT = VA.getLocVT();
break;
}
case CCValAssign::BCvt: {
ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
/*TODO: Kill=*/false);
assert(ArgReg && "Failed to emit a bitcast!");
ArgVT = VA.getLocVT();
break;
}
case CCValAssign::VExt:
// VExt has not been implemented, so this should be impossible to reach
// for now. However, fall back to Selection DAG isel once implemented.
return false;
case CCValAssign::AExtUpper:
case CCValAssign::SExtUpper:
case CCValAssign::ZExtUpper:
case CCValAssign::FPExt:
case CCValAssign::Trunc:
llvm_unreachable("Unexpected loc info!");
case CCValAssign::Indirect:
// FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
// support this.
return false;
}
if (VA.isRegLoc()) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
OutRegs.push_back(VA.getLocReg());
} else {
assert(VA.isMemLoc());
// Don't emit stores for undef values.
if (isa<UndefValue>(ArgVal))
continue;
unsigned LocMemOffset = VA.getLocMemOffset();
X86AddressMode AM;
AM.Base.Reg = RegInfo->getStackRegister();
AM.Disp = LocMemOffset;
ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
if (Flags.isByVal()) {
X86AddressMode SrcAM;
SrcAM.Base.Reg = ArgReg;
if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
return false;
} else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
// If this is a really simple value, emit this with the Value* version
// of X86FastEmitStore. If it isn't simple, we don't want to do this,
// as it can cause us to reevaluate the argument.
if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
return false;
} else {
bool ValIsKill = hasTrivialKill(ArgVal);
if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
return false;
}
}
}
// ELF / PIC requires the GOT pointer to be in EBX before function calls
// via the PLT.
if (Subtarget->isPICStyleGOT()) {
unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
}
if (Is64Bit && IsVarArg && !IsWin64) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
X86::AL).addImm(NumXMMRegs);
}
// Materialize callee address in a register. FIXME: GV address can be
// handled with a CALLpcrel32 instead.
X86AddressMode CalleeAM;
if (!X86SelectCallAddress(Callee, CalleeAM))
return false;
unsigned CalleeOp = 0;
const GlobalValue *GV = nullptr;
if (CalleeAM.GV != nullptr) {
GV = CalleeAM.GV;
} else if (CalleeAM.Base.Reg != 0) {
CalleeOp = CalleeAM.Base.Reg;
} else
return false;
// Issue the call.
MachineInstrBuilder MIB;
if (CalleeOp) {
// Register-indirect call.
unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
.addReg(CalleeOp);
} else {
// Direct call.
assert(GV && "Not a direct call");
// See if we need any target-specific flags on the GV operand.
unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);
// This will be a direct call, or an indirect call through memory for
// NonLazyBind calls or dllimport calls.
bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT ||
OpFlags == X86II::MO_GOTPCREL ||
OpFlags == X86II::MO_COFFSTUB;
unsigned CallOpc = NeedLoad
? (Is64Bit ? X86::CALL64m : X86::CALL32m)
: (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
if (NeedLoad)
MIB.addReg(Is64Bit ? X86::RIP : 0).addImm(1).addReg(0);
if (Symbol)
MIB.addSym(Symbol, OpFlags);
else
MIB.addGlobalAddress(GV, 0, OpFlags);
if (NeedLoad)
MIB.addReg(0);
}
// Add a register mask operand representing the call-preserved registers.
// Proper defs for return values will be added by setPhysRegsDeadExcept().
MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
// Add an implicit use GOT pointer in EBX.
if (Subtarget->isPICStyleGOT())
MIB.addReg(X86::EBX, RegState::Implicit);
if (Is64Bit && IsVarArg && !IsWin64)
MIB.addReg(X86::AL, RegState::Implicit);
// Add implicit physical register uses to the call.
for (auto Reg : OutRegs)
MIB.addReg(Reg, RegState::Implicit);
// Issue CALLSEQ_END
unsigned NumBytesForCalleeToPop =
X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
TM.Options.GuaranteedTailCallOpt)
? NumBytes // Callee pops everything.
: computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CS);
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
.addImm(NumBytes).addImm(NumBytesForCalleeToPop);
// Now handle call return values.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
CLI.RetTy->getContext());
CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
EVT CopyVT = VA.getValVT();
unsigned CopyReg = ResultReg + i;
Register SrcReg = VA.getLocReg();
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
CopyVT = MVT::f80;
CopyReg = createResultReg(&X86::RFP80RegClass);
}
// Copy out the result.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
InRegs.push_back(VA.getLocReg());
// Round the f80 to the right size, which also moves it to the appropriate
// xmm register. This is accomplished by storing the f80 value in memory
// and then loading it back.
if (CopyVT != VA.getValVT()) {
EVT ResVT = VA.getValVT();
unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
unsigned MemSize = ResVT.getSizeInBits()/8;
int FI = MFI.CreateStackObject(MemSize, MemSize, false);
addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc)), FI)
.addReg(CopyReg);
Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt;
addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg + i), FI);
}
}
CLI.ResultReg = ResultReg;
CLI.NumResultRegs = RVLocs.size();
CLI.Call = MIB;
return true;
}
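// The f80 round-trip above corresponds to a sequence along the lines of
//   fstps (%rsp)          ; ST_Fp80m32: spill ST0, rounding f80 -> f32
//   movss (%rsp), %xmm0   ; MOVSSrm_alt: reload into an XMM register
// which is how an x87 return value (FP0/FP1) is moved into the SSE
// register file when the caller expects it there.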
bool
X86FastISel::fastSelectInstruction(const Instruction *I) {
switch (I->getOpcode()) {
default: break;
case Instruction::Load:
return X86SelectLoad(I);
case Instruction::Store:
return X86SelectStore(I);
case Instruction::Ret:
return X86SelectRet(I);
case Instruction::ICmp:
case Instruction::FCmp:
return X86SelectCmp(I);
case Instruction::ZExt:
return X86SelectZExt(I);
case Instruction::SExt:
return X86SelectSExt(I);
case Instruction::Br:
return X86SelectBranch(I);
case Instruction::LShr:
case Instruction::AShr:
case Instruction::Shl:
return X86SelectShift(I);
case Instruction::SDiv:
case Instruction::UDiv:
case Instruction::SRem:
case Instruction::URem:
return X86SelectDivRem(I);
case Instruction::Select:
return X86SelectSelect(I);
case Instruction::Trunc:
return X86SelectTrunc(I);
case Instruction::FPExt:
return X86SelectFPExt(I);
case Instruction::FPTrunc:
return X86SelectFPTrunc(I);
case Instruction::SIToFP:
return X86SelectSIToFP(I);
case Instruction::UIToFP:
return X86SelectUIToFP(I);
case Instruction::IntToPtr: // Deliberate fall-through.
case Instruction::PtrToInt: {
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
EVT DstVT = TLI.getValueType(DL, I->getType());
if (DstVT.bitsGT(SrcVT))
return X86SelectZExt(I);
if (DstVT.bitsLT(SrcVT))
return X86SelectTrunc(I);
unsigned Reg = getRegForValue(I->getOperand(0));
if (Reg == 0) return false;
updateValueMap(I, Reg);
return true;
}
case Instruction::BitCast: {
// Select SSE2/AVX bitcasts between 128/256/512 bit vector types.
if (!Subtarget->hasSSE2())
return false;
MVT SrcVT, DstVT;
if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT) ||
!isTypeLegal(I->getType(), DstVT))
return false;
// Only allow vectors that use xmm/ymm/zmm.
if (!SrcVT.isVector() || !DstVT.isVector() ||
SrcVT.getVectorElementType() == MVT::i1 ||
DstVT.getVectorElementType() == MVT::i1)
return false;
unsigned Reg = getRegForValue(I->getOperand(0));
if (Reg == 0)
return false;
// No instruction is needed for conversion. Reuse the register used by
// the first operand.
updateValueMap(I, Reg);
return true;
}
}
return false;
}
unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
if (VT > MVT::i64)
return 0;
uint64_t Imm = CI->getZExtValue();
if (Imm == 0) {
unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
case MVT::i8:
return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
X86::sub_8bit);
case MVT::i16:
return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true,
X86::sub_16bit);
case MVT::i32:
return SrcReg;
case MVT::i64: {
unsigned ResultReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
.addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
return ResultReg;
}
}
}
unsigned Opc = 0;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
VT = MVT::i8;
LLVM_FALLTHROUGH;
case MVT::i8: Opc = X86::MOV8ri; break;
case MVT::i16: Opc = X86::MOV16ri; break;
case MVT::i32: Opc = X86::MOV32ri; break;
case MVT::i64: {
if (isUInt<32>(Imm))
Opc = X86::MOV32ri64;
else if (isInt<32>(Imm))
Opc = X86::MOV64ri32;
else
Opc = X86::MOV64ri;
break;
}
}
return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
}
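// Opcode choice for nonzero i64 immediates, by example: 0xffffffff fits
// an unsigned 32-bit field, so MOV32ri64 (implicit zero-extension)
// suffices; -1 fits a signed 32-bit field, so MOV64ri32 sign-extends
// it; 0x100000000 needs the full 10-byte MOV64ri (movabsq).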
unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
if (CFP->isNullValue())
return fastMaterializeFloatZero(CFP);
// Can't handle alternate code models yet.
CodeModel::Model CM = TM.getCodeModel();
if (CM != CodeModel::Small && CM != CodeModel::Large)
return 0;
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
bool HasAVX = Subtarget->hasAVX();
bool HasAVX512 = Subtarget->hasAVX512();
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
if (X86ScalarSSEf32)
Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
HasAVX ? X86::VMOVSSrm_alt :
X86::MOVSSrm_alt;
else
Opc = X86::LD_Fp32m;
break;
case MVT::f64:
if (X86ScalarSSEf64)
Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
HasAVX ? X86::VMOVSDrm_alt :
X86::MOVSDrm_alt;
else
Opc = X86::LD_Fp64m;
break;
case MVT::f80:
// No f80 support yet.
return 0;
}
// MachineConstantPool wants an explicit alignment.
unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
if (Align == 0) {
// Alignment of vector types. FIXME!
Align = DL.getTypeAllocSize(CFP->getType());
}
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
else if (OpFlag == X86II::MO_GOTOFF)
PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
else if (Subtarget->is64Bit() && TM.getCodeModel() == CodeModel::Small)
PICBase = X86::RIP;
// Create the load from the constant pool.
unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
if (CM == CodeModel::Large) {
unsigned AddrReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
AddrReg)
.addConstantPoolIndex(CPI, 0, OpFlag);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg);
addDirectMem(MIB, AddrReg);
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getConstantPool(*FuncInfo.MF),
MachineMemOperand::MOLoad, DL.getPointerSize(), Align);
MIB->addMemOperand(*FuncInfo.MF, MMO);
return ResultReg;
}
addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg),
CPI, PICBase, OpFlag);
return ResultReg;
}
unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
// Can't handle alternate code models yet.
if (TM.getCodeModel() != CodeModel::Small)
return 0;
// Materialize addresses with LEA/MOV instructions.
X86AddressMode AM;
if (X86SelectAddress(GV, AM)) {
// If the expression is just a basereg, then we're done, otherwise we need
// to emit an LEA.
if (AM.BaseType == X86AddressMode::RegBase &&
AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
return AM.Base.Reg;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
if (TM.getRelocationModel() == Reloc::Static &&
TLI.getPointerTy(DL) == MVT::i64) {
// The displacement could be more than 32 bits away, so we need to use
// an instruction with a 64-bit immediate.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
ResultReg)
.addGlobalAddress(GV);
} else {
unsigned Opc =
TLI.getPointerTy(DL) == MVT::i32
? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
: X86::LEA64r;
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
}
return ResultReg;
}
return 0;
}
unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
EVT CEVT = TLI.getValueType(DL, C->getType(), true);
// Only handle simple types.
if (!CEVT.isSimple())
return 0;
MVT VT = CEVT.getSimpleVT();
if (const auto *CI = dyn_cast<ConstantInt>(C))
return X86MaterializeInt(CI, VT);
else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
return X86MaterializeFP(CFP, VT);
else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
return X86MaterializeGV(GV, VT);
return 0;
}
unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
// Fail on dynamic allocas. At this point, getRegForValue has already
// checked its CSE maps, so if we're here trying to handle a dynamic
// alloca, we're not going to succeed. X86SelectAddress has a
// check for dynamic allocas, because it's called directly from
// various places, but targetMaterializeAlloca also needs a check
// in order to avoid recursion between getRegForValue,
// X86SelectAddress, and targetMaterializeAlloca.
if (!FuncInfo.StaticAllocaMap.count(C))
return 0;
assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
X86AddressMode AM;
if (!X86SelectAddress(C, AM))
return 0;
unsigned Opc =
TLI.getPointerTy(DL) == MVT::i32
? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
: X86::LEA64r;
const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
unsigned ResultReg = createResultReg(RC);
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
return ResultReg;
}
unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
MVT VT;
if (!isTypeLegal(CF->getType(), VT))
return 0;
// Get opcode and regclass for the given zero.
bool HasAVX512 = Subtarget->hasAVX512();
unsigned Opc = 0;
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
if (X86ScalarSSEf32)
Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
else
Opc = X86::LD_Fp032;
break;
case MVT::f64:
if (X86ScalarSSEf64)
Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
else
Opc = X86::LD_Fp064;
break;
case MVT::f80:
// No f80 support yet.
return 0;
}
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
return ResultReg;
}
bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) {
const Value *Ptr = LI->getPointerOperand();
X86AddressMode AM;
if (!X86SelectAddress(Ptr, AM))
return false;
const X86InstrInfo &XII = (const X86InstrInfo &)TII;
unsigned Size = DL.getTypeAllocSize(LI->getType());
unsigned Alignment = LI->getAlignment();
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = DL.getABITypeAlignment(LI->getType());
SmallVector<MachineOperand, 8> AddrOps;
AM.getFullAddress(AddrOps);
MachineInstr *Result = XII.foldMemoryOperandImpl(
*FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
/*AllowCommute=*/true);
if (!Result)
return false;
// The index register could be in the wrong register class. Unfortunately,
// foldMemoryOperandImpl could have commuted the instruction so it's not enough
// to just look at OpNo + the offset to the index reg. We actually need to
// scan the instruction to find the index reg and see if it's the correct reg
// class.
unsigned OperandNo = 0;
for (MachineInstr::mop_iterator I = Result->operands_begin(),
E = Result->operands_end(); I != E; ++I, ++OperandNo) {
MachineOperand &MO = *I;
if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
continue;
// Found the index reg, now try to rewrite it.
unsigned IndexReg = constrainOperandRegClass(Result->getDesc(),
MO.getReg(), OperandNo);
if (IndexReg == MO.getReg())
continue;
MO.setReg(IndexReg);
}
Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
Result->cloneInstrSymbols(*FuncInfo.MF, *MI);
MachineBasicBlock::iterator I(MI);
removeDeadCode(I, std::next(I));
return true;
}
unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
unsigned Op1, bool Op1IsKill,
unsigned Op2, bool Op2IsKill,
unsigned Op3, bool Op3IsKill) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
unsigned ResultReg = createResultReg(RC);
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);
if (II.getNumDefs() >= 1)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
.addReg(Op0, getKillRegState(Op0IsKill))
.addReg(Op1, getKillRegState(Op1IsKill))
.addReg(Op2, getKillRegState(Op2IsKill))
.addReg(Op3, getKillRegState(Op3IsKill));
else {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(Op0, getKillRegState(Op0IsKill))
.addReg(Op1, getKillRegState(Op1IsKill))
.addReg(Op2, getKillRegState(Op2IsKill))
.addReg(Op3, getKillRegState(Op3IsKill));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
}
return ResultReg;
}
namespace llvm {
FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) {
return new X86FastISel(funcInfo, libInfo);
}
}
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp (revision 362609)
@@ -1,3221 +1,3221 @@
//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the X86 implementation of TargetFrameLowering class.
//
//===----------------------------------------------------------------------===//
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include <cstdlib>
using namespace llvm;
X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
MaybeAlign StackAlignOverride)
: TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(),
STI.is64Bit() ? -8 : -4),
STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
// Cache a bunch of frame-related predicates for this subtarget.
SlotSize = TRI->getSlotSize();
Is64Bit = STI.is64Bit();
IsLP64 = STI.isTarget64BitLP64();
// Standard x86_64 and NaCl use 64-bit frame/stack pointers; x32 uses 32-bit.
Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
StackPtr = TRI->getStackRegister();
}
bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return !MF.getFrameInfo().hasVarSizedObjects() &&
!MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
}
/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
/// call frame pseudos can be simplified. Having a FP, as in the default
/// implementation, is not sufficient here since we can't always use it.
/// Use a more nuanced condition.
bool
X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) ||
(hasFP(MF) && !TRI->needsStackRealignment(MF)) ||
TRI->hasBasePointer(MF);
}
// needsFrameIndexResolution - Do we need to perform FI resolution for
// this function? Normally, this is required only when the function
// has any stack objects. However, FI resolution actually has another job,
// not apparent from the title - it resolves callframesetup/destroy
// that were not simplified earlier.
// So, this is required for x86 functions that have push sequences even
// when there are no stack objects.
bool
X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
return MF.getFrameInfo().hasStackObjects() ||
MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
}
/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register. This is true if the function has variable sized allocas
/// or if frame pointer elimination is disabled.
bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
TRI->needsStackRealignment(MF) ||
MFI.hasVarSizedObjects() ||
MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
MFI.hasStackMap() || MFI.hasPatchPoint() ||
MFI.hasCopyImplyingStackAdjustment());
}
static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
if (IsLP64) {
if (isInt<8>(Imm))
return X86::SUB64ri8;
return X86::SUB64ri32;
} else {
if (isInt<8>(Imm))
return X86::SUB32ri8;
return X86::SUB32ri;
}
}
static unsigned getADDriOpcode(bool IsLP64, int64_t Imm) {
if (IsLP64) {
if (isInt<8>(Imm))
return X86::ADD64ri8;
return X86::ADD64ri32;
} else {
if (isInt<8>(Imm))
return X86::ADD32ri8;
return X86::ADD32ri;
}
}
static unsigned getSUBrrOpcode(bool IsLP64) {
return IsLP64 ? X86::SUB64rr : X86::SUB32rr;
}
static unsigned getADDrrOpcode(bool IsLP64) {
return IsLP64 ? X86::ADD64rr : X86::ADD32rr;
}
static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
if (IsLP64) {
if (isInt<8>(Imm))
return X86::AND64ri8;
return X86::AND64ri32;
}
if (isInt<8>(Imm))
return X86::AND32ri8;
return X86::AND32ri;
}
static unsigned getLEArOpcode(bool IsLP64) {
return IsLP64 ? X86::LEA64r : X86::LEA32r;
}
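// Example of the immediate-size selection above: on LP64, adjusting the
// stack by 8 uses SUB64ri8 (sign-extended 8-bit immediate), while
// adjusting by 4096 needs SUB64ri32; the 32-bit variants (SUB32ri8 /
// SUB32ri) are chosen the same way when the stack pointer is 32 bits.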
/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
/// when it reaches the "return" instruction. We can then pop a stack object
/// to this register without worrying about clobbering it.
static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
const X86RegisterInfo *TRI,
bool Is64Bit) {
const MachineFunction *MF = MBB.getParent();
if (MF->callsEHReturn())
return 0;
const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF);
if (MBBI == MBB.end())
return 0;
switch (MBBI->getOpcode()) {
default: return 0;
case TargetOpcode::PATCHABLE_RET:
case X86::RET:
case X86::RETL:
case X86::RETQ:
case X86::RETIL:
case X86::RETIQ:
case X86::TCRETURNdi:
case X86::TCRETURNri:
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::TCRETURNmi64:
case X86::EH_RETURN:
case X86::EH_RETURN64: {
SmallSet<uint16_t, 8> Uses;
for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MBBI->getOperand(i);
if (!MO.isReg() || MO.isDef())
continue;
Register Reg = MO.getReg();
if (!Reg)
continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
Uses.insert(*AI);
}
for (auto CS : AvailableRegs)
if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP &&
CS != X86::ESP)
return CS;
}
}
return 0;
}
static bool isEAXLiveIn(MachineBasicBlock &MBB) {
for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
unsigned Reg = RegMask.PhysReg;
if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
Reg == X86::AH || Reg == X86::AL)
return true;
}
return false;
}
/// Check if the flags need to be preserved before the terminators.
/// This would be the case, if the eflags is live-in of the region
/// composed by the terminators or live-out of that region, without
/// being defined by a terminator.
static bool
flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
for (const MachineInstr &MI : MBB.terminators()) {
bool BreakNext = false;
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
if (Reg != X86::EFLAGS)
continue;
// This terminator needs an eflags value that is not defined
// by a previous terminator:
// EFLAGS is live-in of the region composed by the terminators.
if (!MO.isDef())
return true;
// This terminator defines the eflags, i.e., we don't need to preserve it.
// However, we still need to check this specific terminator does not
// read a live-in value.
BreakNext = true;
}
// We found a definition of the eflags, no need to preserve them.
if (BreakNext)
return false;
}
// None of the terminators use or define the eflags.
// Check if they are live-out, that would imply we need to preserve them.
for (const MachineBasicBlock *Succ : MBB.successors())
if (Succ->isLiveIn(X86::EFLAGS))
return true;
return false;
}
/// emitSPUpdate - Emit a series of instructions to increment / decrement the
/// stack pointer by a constant value.
void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
const DebugLoc &DL,
int64_t NumBytes, bool InEpilogue) const {
bool isSub = NumBytes < 0;
uint64_t Offset = isSub ? -NumBytes : NumBytes;
MachineInstr::MIFlag Flag =
isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
uint64_t Chunk = (1LL << 31) - 1;
if (Offset > Chunk) {
// Rather than emit a long series of instructions for large offsets,
// load the offset into a register and do one sub/add
unsigned Reg = 0;
unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
if (isSub && !isEAXLiveIn(MBB))
Reg = Rax;
else
Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
unsigned AddSubRROpc =
isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
if (Reg) {
BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
.addImm(Offset)
.setMIFlag(Flag);
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
.addReg(StackPtr)
.addReg(Reg);
MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
return;
} else if (Offset > 8 * Chunk) {
// If we would need more than 8 add or sub instructions (a >16GB stack
// frame), it's worth spilling RAX to materialize this immediate.
// pushq %rax
// movabsq +-$Offset+-SlotSize, %rax
// addq %rsp, %rax
// xchg %rax, (%rsp)
// movq (%rsp), %rsp
assert(Is64Bit && "can't have 32-bit 16GB stack frame");
BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
.addReg(Rax, RegState::Kill)
.setMIFlag(Flag);
// Subtract is not commutative, so negate the offset and always use add.
// Subtract 8 less and add 8 more to account for the PUSH we just did.
if (isSub)
Offset = -(Offset - SlotSize);
else
Offset = Offset + SlotSize;
BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
.addImm(Offset)
.setMIFlag(Flag);
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
.addReg(Rax)
.addReg(StackPtr);
MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
// Exchange the new SP in RAX with the top of the stack.
addRegOffset(
BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
StackPtr, false, 0);
// Load new SP from the top of the stack into RSP.
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
StackPtr, false, 0);
return;
}
}
while (Offset) {
uint64_t ThisVal = std::min(Offset, Chunk);
if (ThisVal == SlotSize) {
// Use push / pop for slot sized adjustments as a size optimization. We
// need to find a dead register when using pop.
unsigned Reg = isSub
? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
: findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
if (Reg) {
unsigned Opc = isSub
? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
: (Is64Bit ? X86::POP64r : X86::POP32r);
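// A push only reads the register (its value is irrelevant, hence the
// undef marking); a pop defines it.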
BuildMI(MBB, MBBI, DL, TII.get(Opc))
.addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
.setMIFlag(Flag);
Offset -= ThisVal;
continue;
}
}
BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
.setMIFlag(Flag);
Offset -= ThisVal;
}
}
MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {
assert(Offset != 0 && "zero offset stack adjustment requested");
// On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
// is tricky.
bool UseLEA;
if (!InEpilogue) {
// Check if inserting the prologue at the beginning
// of MBB would require to use LEA operations.
// We need to use LEA operations if EFLAGS is live in, because
// it means an instruction will read it before it gets defined.
UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);
} else {
// If we can use LEA for SP but we shouldn't, check that none
// of the terminators uses EFLAGS. Otherwise we will insert
// an ADD that will redefine EFLAGS and break the condition.
// Alternatively, we could move the ADD, but this may not be possible
// and is an optimization anyway.
UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());
if (UseLEA && !STI.useLeaForSP())
UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);
// If that assert fires, it means canUseAsEpilogue did not do the
// right thing.
assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&
"We shouldn't have allowed this insertion point");
}
MachineInstrBuilder MI;
if (UseLEA) {
MI = addRegOffset(BuildMI(MBB, MBBI, DL,
TII.get(getLEArOpcode(Uses64BitFramePtr)),
StackPtr),
StackPtr, false, Offset);
} else {
bool IsSub = Offset < 0;
uint64_t AbsOffset = IsSub ? -Offset : Offset;
unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
: getADDriOpcode(Uses64BitFramePtr, AbsOffset);
MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr)
.addImm(AbsOffset);
MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
}
return MI;
}
int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
bool doMergeWithPrevious) const {
if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
(!doMergeWithPrevious && MBBI == MBB.end()))
return 0;
MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
PI = skipDebugInstructionsBackward(PI, MBB.begin());
// It is assumed that the ADD/SUB/LEA instruction is followed by exactly one
// CFI instruction, and that there are no DBG_VALUE or other instructions
// between the ADD/SUB/LEA and its corresponding CFI instruction.
/* TODO: Add support for the case where there are multiple CFI instructions
below the ADD/SUB/LEA, e.g.:
...
add
cfi_def_cfa_offset
cfi_offset
...
*/
if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())
PI = std::prev(PI);
unsigned Opc = PI->getOpcode();
int Offset = 0;
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
PI->getOperand(0).getReg() == StackPtr){
assert(PI->getOperand(1).getReg() == StackPtr);
Offset = PI->getOperand(2).getImm();
} else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
PI->getOperand(0).getReg() == StackPtr &&
PI->getOperand(1).getReg() == StackPtr &&
PI->getOperand(2).getImm() == 1 &&
PI->getOperand(3).getReg() == X86::NoRegister &&
PI->getOperand(5).getReg() == X86::NoRegister) {
// For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
Offset = PI->getOperand(4).getImm();
} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
PI->getOperand(0).getReg() == StackPtr) {
assert(PI->getOperand(1).getReg() == StackPtr);
Offset = -PI->getOperand(2).getImm();
} else
return 0;
PI = MBB.erase(PI);
if (PI != MBB.end() && PI->isCFIInstruction()) PI = MBB.erase(PI);
if (!doMergeWithPrevious)
MBBI = skipDebugInstructionsForward(PI, MBB.end());
return Offset;
}
void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
const MCCFIInstruction &CFIInst) const {
MachineFunction &MF = *MBB.getParent();
unsigned CFIIndex = MF.addFrameInst(CFIInst);
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
void X86FrameLowering::emitCalleeSavedFrameMoves(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
if (CSI.empty()) return;
// Calculate offsets.
for (std::vector<CalleeSavedInfo>::const_iterator
I = CSI.begin(), E = CSI.end(); I != E; ++I) {
int64_t Offset = MFI.getObjectOffset(I->getFrameIdx());
unsigned Reg = I->getReg();
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
BuildCFI(MBB, MBBI, DL,
MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
}
}
void X86FrameLowering::emitStackProbe(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool InProlog) const {
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
if (STI.isTargetWindowsCoreCLR()) {
if (InProlog) {
emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
} else {
emitStackProbeInline(MF, MBB, MBBI, DL, false);
}
} else {
emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
}
}
void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologMBB) const {
const StringRef ChkStkStubSymbol = "__chkstk_stub";
MachineInstr *ChkStkStub = nullptr;
for (MachineInstr &MI : PrologMBB) {
if (MI.isCall() && MI.getOperand(0).isSymbol() &&
ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
ChkStkStub = &MI;
break;
}
}
if (ChkStkStub != nullptr) {
assert(!ChkStkStub->isBundled() &&
"Not expecting bundled instructions here");
MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
assert(std::prev(MBBI) == ChkStkStub &&
"MBBI expected after __chkstk_stub.");
DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
ChkStkStub->eraseFromParent();
}
}
void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
bool InProlog) const {
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
assert(STI.is64Bit() && "different expansion needed for 32 bit");
assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
const TargetInstrInfo &TII = *STI.getInstrInfo();
const BasicBlock *LLVM_BB = MBB.getBasicBlock();
// RAX contains the number of bytes of desired stack adjustment.
// The handling here assumes this value has already been updated so as to
// maintain stack alignment.
//
// We need to exit with RSP modified by this amount and execute suitable
// page touches to notify the OS that we're growing the stack responsibly.
// All stack probing must be done without modifying RSP.
//
// MBB:
// SizeReg = RAX;
// ZeroReg = 0
// CopyReg = RSP
// Flags, TestReg = CopyReg - SizeReg
// FinalReg = !Flags.Ovf ? TestReg : ZeroReg
// LimitReg = gs magic thread env access
// if FinalReg >= LimitReg goto ContinueMBB
// RoundBB:
// RoundReg = page address of FinalReg
// LoopMBB:
// LoopReg = PHI(LimitReg,ProbeReg)
// ProbeReg = LoopReg - PageSize
// [ProbeReg] = 0
// if (ProbeReg > RoundReg) goto LoopMBB
// ContinueMBB:
// RSP = RSP - RAX
// [rest of original MBB]
// Set up the new basic blocks
MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
MF.insert(MBBIter, RoundMBB);
MF.insert(MBBIter, LoopMBB);
MF.insert(MBBIter, ContinueMBB);
// Split MBB and move the tail portion down to ContinueMBB.
MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
// Some useful constants
const int64_t ThreadEnvironmentStackLimit = 0x10;
const int64_t PageSize = 0x1000;
const int64_t PageMask = ~(PageSize - 1);
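// PageMask clears the low 12 bits: e.g. 0x12345 & PageMask == 0x12000,
// the base of the page containing the address.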
// Registers we need. For the normal case we use virtual
// registers. For the prolog expansion we use RAX, RCX and RDX.
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *RegClass = &X86::GR64RegClass;
const Register SizeReg = InProlog ? X86::RAX
: MRI.createVirtualRegister(RegClass),
ZeroReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass),
CopyReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
TestReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
FinalReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
RoundedReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
LimitReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass),
JoinReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass),
ProbeReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass);
// SP-relative offsets where we can save RCX and RDX.
int64_t RCXShadowSlot = 0;
int64_t RDXShadowSlot = 0;
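// An offset of 0 doubles as the "not spilled" sentinel; real slots start
// at 8 or above (see InitSlot below).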
// If inlining in the prolog, save RCX and RDX.
if (InProlog) {
// Compute the offsets. We need to account for things already
// pushed onto the stack at this point: return address, frame
// pointer (if used), and callee saves.
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
const bool HasFP = hasFP(MF);
// Check if we need to spill RCX and/or RDX.
// Here we assume that no earlier prologue instruction changes RCX and/or
// RDX, so checking the block live-ins is enough.
const bool IsRCXLiveIn = MBB.isLiveIn(X86::RCX);
const bool IsRDXLiveIn = MBB.isLiveIn(X86::RDX);
int64_t InitSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
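// e.g. with no FP and no pushed CSRs, InitSlot is 8: the first save lands
// just above the return address, in the caller-allocated home area.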
// Assign the initial slot to both registers, then change RDX's slot if both
// need to be spilled.
if (IsRCXLiveIn)
RCXShadowSlot = InitSlot;
if (IsRDXLiveIn)
RDXShadowSlot = InitSlot;
if (IsRDXLiveIn && IsRCXLiveIn)
RDXShadowSlot += 8;
// Emit the saves if needed.
if (IsRCXLiveIn)
addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
RCXShadowSlot)
.addReg(X86::RCX);
if (IsRDXLiveIn)
addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
RDXShadowSlot)
.addReg(X86::RDX);
} else {
// Not in the prolog. Copy RAX to a virtual reg.
BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
}
// Add code to MBB to check for overflow and set the new target stack pointer
// to zero if so.
BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
.addReg(ZeroReg, RegState::Undef)
.addReg(ZeroReg, RegState::Undef);
BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
.addReg(CopyReg)
.addReg(SizeReg);
BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg)
.addReg(TestReg)
.addReg(ZeroReg)
.addImm(X86::COND_B);
// FinalReg now holds final stack pointer value, or zero if
// allocation would overflow. Compare against the current stack
// limit from the thread environment block. Note this limit is the
// lowest touched page on the stack, not the point at which the OS
// will cause an overflow exception, so this is just an optimization
// to avoid unnecessarily touching pages that are below the current
// SP but already committed to the stack by the OS.
BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
.addReg(0)
.addImm(1)
.addReg(0)
.addImm(ThreadEnvironmentStackLimit)
.addReg(X86::GS);
BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
// Jump if the desired stack pointer is at or above the stack limit.
BuildMI(&MBB, DL, TII.get(X86::JCC_1)).addMBB(ContinueMBB).addImm(X86::COND_AE);
// Add code to roundMBB to round the final stack pointer to a page boundary.
RoundMBB->addLiveIn(FinalReg);
BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
.addReg(FinalReg)
.addImm(PageMask);
BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);
// LimitReg now holds the current stack limit and RoundedReg the page-rounded
// final RSP value. Add code to LoopMBB to probe downward page-by-page from
// LimitReg until we reach RoundedReg.
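// In the prolog expansion JoinReg, LimitReg, and ProbeReg are all RCX
// (see the register assignments above), so the loop-carried value needs no
// PHI; the virtual-register expansion does.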
if (!InProlog) {
BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
.addReg(LimitReg)
.addMBB(RoundMBB)
.addReg(ProbeReg)
.addMBB(LoopMBB);
}
LoopMBB->addLiveIn(JoinReg);
addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
false, -PageSize);
// Probe by storing a byte onto the stack.
BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
.addReg(ProbeReg)
.addImm(1)
.addReg(0)
.addImm(0)
.addReg(0)
.addImm(0);
LoopMBB->addLiveIn(RoundedReg);
BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
.addReg(RoundedReg)
.addReg(ProbeReg);
BuildMI(LoopMBB, DL, TII.get(X86::JCC_1)).addMBB(LoopMBB).addImm(X86::COND_NE);
MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
// If in prolog, restore RDX and RCX.
if (InProlog) {
if (RCXShadowSlot) // It means we spilled RCX in the prologue.
addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,
TII.get(X86::MOV64rm), X86::RCX),
X86::RSP, false, RCXShadowSlot);
if (RDXShadowSlot) // It means we spilled RDX in the prologue.
addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,
TII.get(X86::MOV64rm), X86::RDX),
X86::RSP, false, RDXShadowSlot);
}
// Now that the probing is done, add code to continueMBB to update
// the stack pointer for real.
ContinueMBB->addLiveIn(SizeReg);
BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
.addReg(X86::RSP)
.addReg(SizeReg);
// Add the control flow edges we need.
MBB.addSuccessor(ContinueMBB);
MBB.addSuccessor(RoundMBB);
RoundMBB->addSuccessor(LoopMBB);
LoopMBB->addSuccessor(ContinueMBB);
LoopMBB->addSuccessor(LoopMBB);
// Mark all the instructions added to the prolog as frame setup.
if (InProlog) {
for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
BeforeMBBI->setFlag(MachineInstr::FrameSetup);
}
for (MachineInstr &MI : *RoundMBB) {
MI.setFlag(MachineInstr::FrameSetup);
}
for (MachineInstr &MI : *LoopMBB) {
MI.setFlag(MachineInstr::FrameSetup);
}
for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
CMBBI != ContinueMBBI; ++CMBBI) {
CMBBI->setFlag(MachineInstr::FrameSetup);
}
}
}
void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
bool InProlog) const {
bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
- // FIXME: Add retpoline support and remove this.
- if (Is64Bit && IsLargeCodeModel && STI.useRetpolineIndirectCalls())
+ // FIXME: Add indirect thunk support and remove this.
+ if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls())
report_fatal_error("Emitting stack probe calls on 64-bit with the large "
- "code model and retpoline not yet implemented.");
+ "code model and indirect thunks not yet implemented.");
unsigned CallOp;
if (Is64Bit)
CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
else
CallOp = X86::CALLpcrel32;
StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);
MachineInstrBuilder CI;
MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
// All current stack probes take AX and SP as input, clobber flags, and
// preserve all registers. x86_64 probes leave RSP unmodified.
if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
// For the large code model, we have to call through a register. Use R11,
// as it is scratch in all supported calling conventions.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
.addExternalSymbol(MF.createExternalSymbolName(Symbol));
CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
} else {
CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))
.addExternalSymbol(MF.createExternalSymbolName(Symbol));
}
unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX;
unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP;
CI.addReg(AX, RegState::Implicit)
.addReg(SP, RegState::Implicit)
.addReg(AX, RegState::Define | RegState::Implicit)
.addReg(SP, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
if (STI.isTargetWin64() || !STI.isOSWindows()) {
// MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
// themselves. They also do not clobber %rax so we can reuse it when
// adjusting %rsp.
// All other platforms do not specify a particular ABI for the stack probe
// function, so we arbitrarily define it to not adjust %esp/%rsp itself.
BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP)
.addReg(SP)
.addReg(AX);
}
if (InProlog) {
// Apply the frame setup flag to all inserted instrs.
for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
}
}
void X86FrameLowering::emitStackProbeInlineStub(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
assert(InProlog && "ChkStkStub called outside prolog!");
BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
.addExternalSymbol("__chkstk_stub");
}
static unsigned calculateSetFPREG(uint64_t SPAdjust) {
// Win64 ABI has a less restrictive limitation of 240; 128 works equally well
// and might require smaller successive adjustments.
const uint64_t Win64MaxSEHOffset = 128;
uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);
// Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode.
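// e.g. SPAdjust = 40: min(40, 128) = 40, and 40 & -16 = 32.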
return SEHFrameOffset & -16;
}
// If we're forcing a stack realignment we can't rely on just the frame
// info, we need to know the ABI stack alignment as well in case we
// have a call out. Otherwise just make sure we have some alignment - we'll
// go with the minimum SlotSize.
uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
uint64_t MaxAlign = MFI.getMaxAlignment(); // Desired stack alignment.
unsigned StackAlign = getStackAlignment();
if (MF.getFunction().hasFnAttribute("stackrealign")) {
if (MFI.hasCalls())
MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
else if (MaxAlign < SlotSize)
MaxAlign = SlotSize;
}
return MaxAlign;
}
void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned Reg,
uint64_t MaxAlign) const {
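// For a power-of-two MaxAlign, -MaxAlign is a mask with the low bits clear;
// ANDing the register with it rounds it down to the alignment
// (e.g. RSP & -16 zeroes the low four bits).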
uint64_t Val = -MaxAlign;
unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
.addReg(Reg)
.addImm(Val)
.setMIFlag(MachineInstr::FrameSetup);
// The EFLAGS implicit def is dead.
MI->getOperand(3).setIsDead();
}
bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
// x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be
// clobbered by any interrupt handler.
assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
"MF used frame lowering for wrong subtarget");
const Function &Fn = MF.getFunction();
const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone);
}
/// emitPrologue - Push callee-saved registers onto the stack, which
/// automatically adjusts the stack pointer. Adjust the stack pointer to
/// allocate space for local variables. Also emit labels used by the exception
/// handler to generate the exception handling frames.
/*
Here's a gist of what gets emitted:
; Establish frame pointer, if needed
[if needs FP]
push %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
.seh_pushreg %rbp
mov %rsp, %rbp
.cfi_def_cfa_register %rbp
; Spill general-purpose registers
[for all callee-saved GPRs]
pushq %<reg>
[if not needs FP]
.cfi_def_cfa_offset (offset from RETADDR)
.seh_pushreg %<reg>
; If the required stack alignment > default stack alignment
; rsp needs to be re-aligned. This creates a "re-alignment gap"
; of unknown size in the stack frame.
[if stack needs re-alignment]
and $MASK, %rsp
; Allocate space for locals
[if target is Windows and allocated space > 4096 bytes]
; Windows needs special care for allocations larger
; than one page.
mov $NNN, %rax
call ___chkstk_ms/___chkstk
sub %rax, %rsp
[else]
sub $NNN, %rsp
[if needs FP]
.seh_stackalloc (size of XMM spill slots)
.seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
[else]
.seh_stackalloc NNN
; Spill XMMs
; Note that while only the Windows 64 ABI specifies XMMs as callee-preserved,
; they may get spilled on any platform if the current function
; calls @llvm.eh.unwind.init
[if needs FP]
[for all callee-saved XMM registers]
movaps %<xmm reg>, -MMM(%rbp)
[for all callee-saved XMM registers]
.seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
; i.e. the offset relative to (%rbp - SEHFrameOffset)
[else]
[for all callee-saved XMM registers]
movaps %<xmm reg>, KKK(%rsp)
[for all callee-saved XMM registers]
.seh_savexmm %<xmm reg>, KKK
.seh_endprologue
[if needs base pointer]
mov %rsp, %rbx
[if needs to restore base pointer]
mov %rsp, -MMM(%rbp)
; Emit CFI info
[if needs FP]
[for all callee-saved registers]
.cfi_offset %<reg>, (offset from %rbp)
[else]
.cfi_def_cfa_offset (offset from RETADDR)
[for all callee-saved registers]
.cfi_offset %<reg>, (offset from %rsp)
Notes:
- .seh directives are emitted only for Windows 64 ABI
- .cv_fpo directives are emitted on win32 when emitting CodeView
- .cfi directives are emitted for all other ABIs
- for 32-bit code, substitute %e?? registers for %r??
*/
void X86FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
"MF used frame lowering for wrong subtarget");
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo &MFI = MF.getFrameInfo();
const Function &Fn = MF.getFunction();
MachineModuleInfo &MMI = MF.getMMI();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
bool IsFunclet = MBB.isEHFuncletEntry();
EHPersonality Personality = EHPersonality::Unknown;
if (Fn.hasPersonalityFn())
Personality = classifyEHPersonality(Fn.getPersonalityFn());
bool FnHasClrFunclet =
MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
bool HasFP = hasFP(MF);
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();
// FIXME: Emit FPO data for EH funclets.
bool NeedsWinFPO =
!IsFunclet && STI.isTargetWin32() && MMI.getModule()->getCodeViewFlag();
bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;
bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
Register FramePtr = TRI->getFrameRegister(MF);
const Register MachineFramePtr =
STI.isTarget64BitILP32()
? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
Register BasePtr = TRI->getBaseRegister();
bool HasWinCFI = false;
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
// Add RETADDR move area to callee saved frame size.
int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
if (TailCallReturnAddrDelta && IsWin64Prologue)
report_fatal_error("Can't handle guaranteed tail call under win64 yet");
if (TailCallReturnAddrDelta < 0)
X86FI->setCalleeSavedFrameSize(
X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty();
unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);
// Re-align the stack on 64-bit if the x86-interrupt calling convention is
// used and an error code was pushed, since the x86-64 ABI requires a 16-byte
// stack alignment.
if (Fn.getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
Fn.arg_size() == 2) {
StackSize += 8;
MFI.setStackSize(StackSize);
emitSPUpdate(MBB, MBBI, DL, -8, /*InEpilogue=*/false);
}
// If this is x86-64, the Red Zone is not disabled, we are a leaf function
// using at most 128 bytes of stack space, and we have no frame pointer,
// calls, or dynamic allocas, then we do not need to adjust the stack
// pointer (we fit in the Red Zone). We also check that we don't push and
// pop from the stack.
if (has128ByteRedZone(MF) &&
!TRI->needsStackRealignment(MF) &&
!MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
!UseStackProbe && // No stack probes.
!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
if (HasFP) MinSize += SlotSize;
X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
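// Fold up to 128 bytes of the frame into the red zone: e.g. a 160-byte
// frame only needs a 32-byte adjustment of the stack pointer.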
StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
MFI.setStackSize(StackSize);
}
// Insert stack pointer adjustment for later moving of return addr. Only
// applies to tail call optimized functions where the callee argument stack
// size is bigger than the callers.
if (TailCallReturnAddrDelta < 0) {
BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta,
/*InEpilogue=*/false)
.setMIFlag(MachineInstr::FrameSetup);
}
// Mapping for machine moves:
//
// DST: VirtualFP AND
// SRC: VirtualFP => DW_CFA_def_cfa_offset
// ELSE => DW_CFA_def_cfa
//
// SRC: VirtualFP AND
// DST: Register => DW_CFA_def_cfa_register
//
// ELSE
// OFFSET < 0 => DW_CFA_offset_extended_sf
// REG < 64 => DW_CFA_offset + Reg
// ELSE => DW_CFA_offset_extended
uint64_t NumBytes = 0;
int stackGrowth = -SlotSize;
// Find the funclet establisher parameter
Register Establisher = X86::NoRegister;
if (IsClrFunclet)
Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
else if (IsFunclet)
Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;
if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
// Immediately spill establisher into the home slot.
// The runtime cares about this.
// MOV64mr %rdx, 16(%rsp)
unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)
.addReg(Establisher)
.setMIFlag(MachineInstr::FrameSetup);
MBB.addLiveIn(Establisher);
}
if (HasFP) {
assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved");
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
// If required, include space for extra hidden slot for stashing base pointer.
if (X86FI->getRestoreBasePointer())
FrameSize += SlotSize;
NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
// Callee-saved registers are pushed on stack before the stack is realigned.
if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
NumBytes = alignTo(NumBytes, MaxAlign);
// Save EBP/RBP into the appropriate stack slot.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
.addReg(MachineFramePtr, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
if (NeedsDwarfCFI) {
// Mark the place where EBP/RBP was saved.
// Define the current CFA rule to use the provided offset.
assert(StackSize);
BuildCFI(MBB, MBBI, DL,
MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
// Change the rule for the FramePtr to be an "offset" rule.
unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(
nullptr, DwarfFramePtr, 2 * stackGrowth));
}
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
.addImm(FramePtr)
.setMIFlag(MachineInstr::FrameSetup);
}
if (!IsWin64Prologue && !IsFunclet) {
// Update EBP with the new base value.
BuildMI(MBB, MBBI, DL,
TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
FramePtr)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
if (NeedsDwarfCFI) {
// Mark effective beginning of when frame pointer becomes valid.
// Define the current CFA to use the EBP/RBP register.
unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister(
nullptr, DwarfFramePtr));
}
if (NeedsWinFPO) {
// .cv_fpo_setframe $FramePtr
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
.addImm(FramePtr)
.addImm(0)
.setMIFlag(MachineInstr::FrameSetup);
}
}
} else {
assert(!IsFunclet && "funclets without FPs not yet implemented");
NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
}
// Update the offset adjustment, which is mainly used by codeview to translate
// from ESP to VFRAME relative local variable offsets.
if (!IsFunclet) {
if (HasFP && TRI->needsStackRealignment(MF))
MFI.setOffsetAdjustment(-NumBytes);
else
MFI.setOffsetAdjustment(-StackSize);
}
// For EH funclets, only allocate enough space for outgoing calls. Save the
// NumBytes value that we would've used for the parent frame.
unsigned ParentFrameNumBytes = NumBytes;
if (IsFunclet)
NumBytes = getWinEHFuncletFrameSize(MF);
// Skip the callee-saved push instructions.
bool PushedRegs = false;
int StackOffset = 2 * stackGrowth;
while (MBBI != MBB.end() &&
MBBI->getFlag(MachineInstr::FrameSetup) &&
(MBBI->getOpcode() == X86::PUSH32r ||
MBBI->getOpcode() == X86::PUSH64r)) {
PushedRegs = true;
Register Reg = MBBI->getOperand(0).getReg();
++MBBI;
if (!HasFP && NeedsDwarfCFI) {
// Mark callee-saved push instruction.
// Define the current CFA rule to use the provided offset.
assert(StackSize);
BuildCFI(MBB, MBBI, DL,
MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
StackOffset += stackGrowth;
}
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
.addImm(Reg)
.setMIFlag(MachineInstr::FrameSetup);
}
}
// Realign stack after we pushed callee-saved registers (so that we'll be
// able to calculate their offsets from the frame pointer).
// Don't do this for Win64, it needs to realign the stack after the prologue.
if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) {
assert(HasFP && "There should be a frame pointer if stack is realigned.");
BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlign))
.addImm(MaxAlign)
.setMIFlag(MachineInstr::FrameSetup);
}
}
// If there is a SUB32ri of ESP immediately before this instruction, merge
// the two. This can be the case when tail call elimination is enabled and
// the callee has more arguments than the caller.
NumBytes -= mergeSPUpdates(MBB, MBBI, true);
// Adjust stack pointer: ESP -= numbytes.
// Windows and cygwin/mingw require a prologue helper routine when allocating
// more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw
// uses __alloca. __alloca and the 32-bit version of __chkstk will probe the
// stack and adjust the stack pointer in one go. The 64-bit version of
// __chkstk is only responsible for probing the stack. The 64-bit prologue is
// responsible for adjusting the stack pointer. Touching the stack at 4K
// increments is necessary to ensure that the guard pages used by the OS
// virtual memory manager are allocated in correct sequence.
uint64_t AlignedNumBytes = NumBytes;
if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
assert(!X86FI->getUsesRedZone() &&
"The Red Zone is not accounted for in stack probes");
// Check whether EAX is livein for this block.
bool isEAXAlive = isEAXLiveIn(MBB);
if (isEAXAlive) {
if (Is64Bit) {
// Save RAX
BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
.addReg(X86::RAX, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
} else {
// Save EAX
BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
.addReg(X86::EAX, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
}
}
if (Is64Bit) {
// Handle the 64-bit Windows ABI case where we need to call __chkstk.
// Function prologue is responsible for adjusting the stack pointer.
int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;
if (isUInt<32>(Alloc)) {
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
.addImm(Alloc)
.setMIFlag(MachineInstr::FrameSetup);
} else if (isInt<32>(Alloc)) {
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
.addImm(Alloc)
.setMIFlag(MachineInstr::FrameSetup);
} else {
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
.addImm(Alloc)
.setMIFlag(MachineInstr::FrameSetup);
}
} else {
// If EAX is live, allocate only NumBytes-4 bytes on the stack; the 4 bytes
// already allocated by the earlier push will be reused for EAX.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
.addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
.setMIFlag(MachineInstr::FrameSetup);
}
// Call __chkstk, __chkstk_ms, or __alloca.
emitStackProbe(MF, MBB, MBBI, DL, true);
if (isEAXAlive) {
// Restore RAX/EAX
MachineInstr *MI;
if (Is64Bit)
MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV64rm), X86::RAX),
StackPtr, false, NumBytes - 8);
else
MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
StackPtr, false, NumBytes - 4);
MI->setFlag(MachineInstr::FrameSetup);
MBB.insert(MBBI, MI);
}
} else if (NumBytes) {
emitSPUpdate(MBB, MBBI, DL, -(int64_t)NumBytes, /*InEpilogue=*/false);
}
if (NeedsWinCFI && NumBytes) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
.addImm(NumBytes)
.setMIFlag(MachineInstr::FrameSetup);
}
int SEHFrameOffset = 0;
unsigned SPOrEstablisher;
if (IsFunclet) {
if (IsClrFunclet) {
// The establisher parameter passed to a CLR funclet is actually a pointer
// to the (mostly empty) frame of its nearest enclosing funclet; we have
// to find the root function establisher frame by loading the PSPSym from
// the intermediate frame.
unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
MachinePointerInfo NoInfo;
MBB.addLiveIn(Establisher);
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
Establisher, false, PSPSlotOffset)
.addMemOperand(MF.getMachineMemOperand(
NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize));
// Save the root establisher back into the current funclet's (mostly
// empty) frame, in case a sub-funclet or the GC needs it.
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
false, PSPSlotOffset)
.addReg(Establisher)
.addMemOperand(
MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore |
MachineMemOperand::MOVolatile,
SlotSize, SlotSize));
}
SPOrEstablisher = Establisher;
} else {
SPOrEstablisher = StackPtr;
}
if (IsWin64Prologue && HasFP) {
// Set RBP to a small fixed offset from RSP. In the funclet case, we base
// this calculation on the incoming establisher, which holds the value of
// RSP from the parent frame at the end of the prologue.
SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);
if (SEHFrameOffset)
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
SPOrEstablisher, false, SEHFrameOffset);
else
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)
.addReg(SPOrEstablisher);
// If this is not a funclet, emit the CFI describing our frame pointer.
if (NeedsWinCFI && !IsFunclet) {
assert(!NeedsWinFPO && "this setframe incompatible with FPO data");
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
.addImm(FramePtr)
.addImm(SEHFrameOffset)
.setMIFlag(MachineInstr::FrameSetup);
if (isAsynchronousEHPersonality(Personality))
MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;
}
} else if (IsFunclet && STI.is32Bit()) {
// Reset EBP / ESI to something good for funclets.
MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);
// If we're a catch funclet, we can be returned to via catchret. Save ESP
// into the registration node so that the runtime will restore it for us.
if (!MBB.isCleanupFuncletEntry()) {
assert(Personality == EHPersonality::MSVC_CXX);
unsigned FrameReg;
int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg);
// ESP is the first field, so no extra displacement is needed.
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
false, EHRegOffset)
.addReg(X86::ESP);
}
}
while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
const MachineInstr &FrameInstr = *MBBI;
++MBBI;
if (NeedsWinCFI) {
int FI;
if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
if (X86::FR64RegClass.contains(Reg)) {
int Offset;
unsigned IgnoredFrameReg;
if (IsWin64Prologue && IsFunclet)
Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
else
Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) +
SEHFrameOffset;
HasWinCFI = true;
assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
.addImm(Reg)
.addImm(Offset)
.setMIFlag(MachineInstr::FrameSetup);
}
}
}
}
if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
.setMIFlag(MachineInstr::FrameSetup);
if (FnHasClrFunclet && !IsFunclet) {
// Save the so-called Initial-SP (i.e. the value of the stack pointer
// immediately after the prolog) into the PSPSlot so that funclets
// and the GC can recover it.
unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
auto PSPInfo = MachinePointerInfo::getFixedStack(
MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false,
PSPSlotOffset)
.addReg(StackPtr)
.addMemOperand(MF.getMachineMemOperand(
PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
SlotSize, SlotSize));
}
// Realign stack after we spilled callee-saved registers (so that we'll be
// able to calculate their offsets from the frame pointer).
// Win64 requires aligning the stack after the prologue.
if (IsWin64Prologue && TRI->needsStackRealignment(MF)) {
assert(HasFP && "There should be a frame pointer if stack is realigned.");
BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);
}
// We already dealt with stack realignment and funclets above.
if (IsFunclet && STI.is32Bit())
return;
// If we need a base pointer, set it up here. It's whatever the value
// of the stack pointer is at this point. Any variable size objects
// will be allocated after this, so we can still use the base pointer
// to reference locals.
if (TRI->hasBasePointer(MF)) {
// Update the base pointer with the current stack pointer.
unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
.addReg(SPOrEstablisher)
.setMIFlag(MachineInstr::FrameSetup);
if (X86FI->getRestoreBasePointer()) {
// Stash value of base pointer. Saving RSP instead of EBP shortens
// dependence chain. Used by SjLj EH.
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
.addReg(SPOrEstablisher)
.setMIFlag(MachineInstr::FrameSetup);
}
if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {
// Stash the value of the frame pointer relative to the base pointer for
// Win32 EH. This supports Win32 EH, which does the inverse of the above:
// it recovers the frame pointer from the base pointer rather than the
// other way around.
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
unsigned UsedReg;
int Offset =
getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
assert(UsedReg == BasePtr);
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
.addReg(FramePtr)
.setMIFlag(MachineInstr::FrameSetup);
}
}
if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
// Mark end of stack pointer adjustment.
if (!HasFP && NumBytes) {
// Define the current CFA rule to use the provided offset.
assert(StackSize);
BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
nullptr, -StackSize + stackGrowth));
}
// Emit DWARF info specifying the offsets of the callee-saved registers.
emitCalleeSavedFrameMoves(MBB, MBBI, DL);
}
// X86 Interrupt handling function cannot assume anything about the direction
// flag (DF in EFLAGS register). Clear this flag by creating "cld" instruction
// in each prologue of interrupt handler function.
//
// FIXME: Create "cld" instruction only in these cases:
// 1. The interrupt handling function uses any of the "rep" instructions.
// 2. Interrupt handling function calls another function.
//
if (Fn.getCallingConv() == CallingConv::X86_INTR)
BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
.setMIFlag(MachineInstr::FrameSetup);
// At this point we know if the function has WinCFI or not.
MF.setHasWinCFI(HasWinCFI);
}
bool X86FrameLowering::canUseLEAForSPInEpilogue(
const MachineFunction &MF) const {
// We can't use LEA instructions for adjusting the stack pointer if we don't
// have a frame pointer in the Win64 ABI. Only ADD instructions may be used
// to deallocate the stack.
// This means that we can use LEA for SP in two situations:
// 1. We *aren't* using the Win64 ABI which means we are free to use LEA.
// 2. We *have* a frame pointer which means we are permitted to use LEA.
return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);
}
static bool isFuncletReturnInstr(MachineInstr &MI) {
switch (MI.getOpcode()) {
case X86::CATCHRET:
case X86::CLEANUPRET:
return true;
default:
return false;
}
llvm_unreachable("impossible");
}
// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the
// stack. It holds a pointer to the bottom of the root function frame. The
// establisher frame pointer passed to a nested funclet may point to the
// (mostly empty) frame of its parent funclet, but it will need to find
// the frame of the root function to access locals. To facilitate this,
// every funclet copies the pointer to the bottom of the root function
// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the
// same offset for the PSPSym in the root function frame that's used in the
// funclets' frames allows each funclet to dynamically accept any ancestor
// frame as its establisher argument (the runtime doesn't guarantee the
// immediate parent for some reason lost to history), and also allows the GC,
// which uses the PSPSym for some bookkeeping, to find it in any funclet's
// frame with only a single offset reported for the entire method.
unsigned
X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
unsigned SPReg;
int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
/*IgnoreSPUpdates*/ true);
assert(Offset >= 0 && SPReg == TRI->getStackRegister());
return static_cast<unsigned>(Offset);
}
unsigned
X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
// This is the size of the pushed CSRs.
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
// This is the size of callee saved XMMs.
const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
unsigned XMMSize = WinEHXMMSlotInfo.size() *
TRI->getSpillSize(X86::VR128RegClass);
// This is the amount of stack a funclet needs to allocate.
unsigned UsedSize;
EHPersonality Personality =
classifyEHPersonality(MF.getFunction().getPersonalityFn());
if (Personality == EHPersonality::CoreCLR) {
// CLR funclets need to hold enough space to include the PSPSym, at the
// same offset from the stack pointer (immediately after the prolog) as it
// resides at in the main function.
UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;
} else {
// Other funclets just need enough stack for outgoing call arguments.
UsedSize = MF.getFrameInfo().getMaxCallFrameSize();
}
// RBP is not included in the callee saved register block. After pushing RBP,
// everything is 16 byte aligned. Everything we allocate before an outgoing
// call must also be 16 byte aligned.
unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment());
// Subtract out the size of the callee saved registers. This is how much stack
// each funclet will allocate.
return FrameSizeMinusRBP + XMMSize - CSSize;
}
static bool isTailCallOpcode(unsigned Opc) {
return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
Opc == X86::TCRETURNmi ||
Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNdi64 ||
Opc == X86::TCRETURNmi64;
}
void X86FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
MachineBasicBlock::iterator Terminator = MBB.getFirstTerminator();
MachineBasicBlock::iterator MBBI = Terminator;
DebugLoc DL;
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
// Standard x86_64 and NaCl use 64-bit frame/stack pointers; x32 uses 32-bit ones.
const bool Is64BitILP32 = STI.isTarget64BitILP32();
Register FramePtr = TRI->getFrameRegister(MF);
unsigned MachineFramePtr =
Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsWin64CFI =
IsWin64Prologue && MF.getFunction().needsUnwindTableEntry();
bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI);
// Get the number of bytes to allocate from the FrameInfo.
uint64_t StackSize = MFI.getStackSize();
uint64_t MaxAlign = calculateMaxStackAlign(MF);
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
bool HasFP = hasFP(MF);
uint64_t NumBytes = 0;
bool NeedsDwarfCFI = (!MF.getTarget().getTargetTriple().isOSDarwin() &&
!MF.getTarget().getTargetTriple().isOSWindows()) &&
MF.needsFrameMoves();
if (IsFunclet) {
assert(HasFP && "EH funclets without FP not yet implemented");
NumBytes = getWinEHFuncletFrameSize(MF);
} else if (HasFP) {
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
NumBytes = FrameSize - CSSize;
// Callee-saved registers were pushed on stack before the stack was
// realigned.
if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
NumBytes = alignTo(FrameSize, MaxAlign);
} else {
NumBytes = StackSize - CSSize;
}
uint64_t SEHStackAllocAmt = NumBytes;
if (HasFP) {
// Pop EBP.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
MachineFramePtr)
.setMIFlag(MachineInstr::FrameDestroy);
if (NeedsDwarfCFI) {
unsigned DwarfStackPtr =
TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
nullptr, DwarfStackPtr, -SlotSize));
--MBBI;
}
}
MachineBasicBlock::iterator FirstCSPop = MBBI;
// Skip the callee-saved pop instructions.
while (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PI = std::prev(MBBI);
unsigned Opc = PI->getOpcode();
if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
(Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)))
break;
FirstCSPop = PI;
}
--MBBI;
}
MBBI = FirstCSPop;
if (IsFunclet && Terminator->getOpcode() == X86::CATCHRET)
emitCatchRetReturnValue(MBB, FirstCSPop, &*Terminator);
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
// If there is an ADD32ri or SUB32ri of ESP immediately before this
// instruction, merge the two instructions.
if (NumBytes || MFI.hasVarSizedObjects())
NumBytes += mergeSPUpdates(MBB, MBBI, true);
// If dynamic alloca is used, then reset ESP to point to the last callee-saved
// slot before popping them off. The same applies when the stack was
// realigned. Don't do this if this was a funclet epilogue, since funclets
// will not do realignment or dynamic stack allocation.
if ((TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) &&
!IsFunclet) {
if (TRI->needsStackRealignment(MF))
MBBI = FirstCSPop;
unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
uint64_t LEAAmount =
IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;
// There are only two legal forms of epilogue:
// - add SEHAllocationSize, %rsp
// - lea SEHAllocationSize(%FramePtr), %rsp
//
// 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.
// However, we may use this sequence if we have a frame pointer because the
// effects of the prologue can safely be undone.
if (LEAAmount != 0) {
unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
FramePtr, false, LEAAmount);
--MBBI;
} else {
unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(FramePtr);
--MBBI;
}
} else if (NumBytes) {
// Adjust stack pointer back: ESP += numbytes.
emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);
if (!hasFP(MF) && NeedsDwarfCFI) {
// Define the current CFA rule to use the provided offset.
BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
nullptr, -CSSize - SlotSize));
}
--MBBI;
}
// The Windows unwinder will not invoke a function's exception handler if the
// IP is either in the prologue or in the epilogue. This causes a problem when a
// call immediately precedes an epilogue, because the return address points
// into the epilogue. To cope with that, we insert an epilogue marker here,
// then replace it with a 'nop' if it ends up immediately after a CALL in the
// final emitted code.
if (NeedsWin64CFI && MF.hasWinCFI())
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
if (!hasFP(MF) && NeedsDwarfCFI) {
MBBI = FirstCSPop;
int64_t Offset = -CSSize - SlotSize;
// Mark callee-saved pop instruction.
// Define the current CFA rule to use the provided offset.
while (MBBI != MBB.end()) {
MachineBasicBlock::iterator PI = MBBI;
unsigned Opc = PI->getOpcode();
++MBBI;
if (Opc == X86::POP32r || Opc == X86::POP64r) {
Offset += SlotSize;
BuildCFI(MBB, MBBI, DL,
MCCFIInstruction::createDefCfaOffset(nullptr, Offset));
}
}
}
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
int Offset = -1 * X86FI->getTCReturnAddrDelta();
assert(Offset >= 0 && "TCDelta should never be positive");
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += mergeSPUpdates(MBB, Terminator, true);
emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
}
}
}
int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
bool IsFixed = MFI.isFixedObjectIndex(FI);
// We can't calculate offset from frame pointer if the stack is realigned,
// so enforce usage of stack/base pointer. The base pointer is used when we
// have dynamic allocas in addition to dynamic realignment.
if (TRI->hasBasePointer(MF))
FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister();
else if (TRI->needsStackRealignment(MF))
FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getStackRegister();
else
FrameReg = TRI->getFrameRegister(MF);
// Offset will hold the offset from the stack pointer at function entry to the
// object.
// We need to factor in additional offsets applied during the prologue to the
// frame, base, and stack pointer depending on which is used.
int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
uint64_t StackSize = MFI.getStackSize();
bool HasFP = hasFP(MF);
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
int64_t FPDelta = 0;
// In an x86 interrupt, remove the offset we added to account for the return
// address from any stack object allocated in the caller's frame. Interrupts
// do not have a standard return address. Fixed objects in the current frame,
// such as SSE register spills, should not get this treatment.
if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR &&
Offset >= 0) {
Offset += getOffsetOfLocalArea();
}
if (IsWin64Prologue) {
assert(!MFI.hasCalls() || (StackSize % 16) == 8);
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
// If required, include space for extra hidden slot for stashing base pointer.
if (X86FI->getRestoreBasePointer())
FrameSize += SlotSize;
uint64_t NumBytes = FrameSize - CSSize;
uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
if (FI && FI == X86FI->getFAIndex())
return -SEHFrameOffset;
// FPDelta is the offset from the "traditional" FP location of the old base
// pointer followed by return address and the location required by the
// restricted Win64 prologue.
// Add FPDelta to all offsets below that go through the frame pointer.
FPDelta = FrameSize - SEHFrameOffset;
assert((!MFI.hasCalls() || (FPDelta % 16) == 0) &&
"FPDelta isn't aligned per the Win64 ABI!");
}
if (TRI->hasBasePointer(MF)) {
assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
if (FI < 0) {
// Skip the saved EBP.
return Offset + SlotSize + FPDelta;
} else {
assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
return Offset + StackSize;
}
} else if (TRI->needsStackRealignment(MF)) {
if (FI < 0) {
// Skip the saved EBP.
return Offset + SlotSize + FPDelta;
} else {
assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
return Offset + StackSize;
}
// FIXME: Support tail calls
} else {
if (!HasFP)
return Offset + StackSize;
// Skip the saved EBP.
Offset += SlotSize;
// Skip the RETADDR move area
int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
if (TailCallReturnAddrDelta < 0)
Offset -= TailCallReturnAddrDelta;
}
return Offset + FPDelta;
}
int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF,
int FI, unsigned &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
const auto it = WinEHXMMSlotInfo.find(FI);
if (it == WinEHXMMSlotInfo.end())
return getFrameIndexReference(MF, FI, FrameReg);
FrameReg = TRI->getStackRegister();
return alignDown(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second;
}
int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
int FI, unsigned &FrameReg,
int Adjustment) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
FrameReg = TRI->getStackRegister();
return MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment;
}
int
X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
int FI, unsigned &FrameReg,
bool IgnoreSPUpdates) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
// Does not include any dynamic realign.
const uint64_t StackSize = MFI.getStackSize();
// LLVM arranges the stack as follows:
// ...
// ARG2
// ARG1
// RETADDR
// PUSH RBP <-- RBP points here
// PUSH CSRs
// ~~~~~~~ <-- possible stack realignment (non-win64)
// ...
// STACK OBJECTS
// ... <-- RSP after prologue points here
// ~~~~~~~ <-- possible stack realignment (win64)
//
// if (hasVarSizedObjects()):
// ... <-- "base pointer" (ESI/RBX) points here
// DYNAMIC ALLOCAS
// ... <-- RSP points here
//
// Case 1: In the simple case of no stack realignment and no dynamic
// allocas, both "fixed" stack objects (arguments and CSRs) are addressable
// with fixed offsets from RSP.
//
// Case 2: In the case of stack realignment with no dynamic allocas, fixed
// stack objects are addressed with RBP and regular stack objects with RSP.
//
// Case 3: In the case of dynamic allocas and stack realignment, RSP is used
// to address stack arguments for outgoing calls and nothing else. The "base
// pointer" points to local variables, and RBP points to fixed objects.
//
// In cases 2 and 3, we can only answer for non-fixed stack objects, and the
// answer we give is relative to the SP after the prologue, and not the
// SP in the middle of the function.
if (MFI.isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) &&
!STI.isTargetWin64())
return getFrameIndexReference(MF, FI, FrameReg);
// If !hasReservedCallFrame the function might have SP adjustments in the
// body. So even though the offset is statically known, it depends on where
// we are in the function.
if (!IgnoreSPUpdates && !hasReservedCallFrame(MF))
return getFrameIndexReference(MF, FI, FrameReg);
// We don't handle tail calls, and shouldn't be seeing them either.
assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&
"we don't handle this case!");
// This is how the math works out:
//
// %rsp grows (i.e. gets lower) left to right. Each box below is
// one word (eight bytes). Obj0 is the stack slot we're trying to
// get to.
//
// ----------------------------------
// | BP | Obj0 | Obj1 | ... | ObjN |
// ----------------------------------
// ^ ^ ^ ^
// A B C E
//
// A is the incoming stack pointer.
// (B - A) is the local area offset (-8 for x86-64) [1]
// (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2]
//
// |(E - B)| is the StackSize (absolute value, positive). For a
// stack that grows down, this works out to be (B - E). [3]
//
// E is also the value of %rsp after stack has been set up, and we
// want (C - E) -- the value we can add to %rsp to get to Obj0. Now
// (C - E) == (C - A) - (B - A) + (B - E)
// { Using [1], [2] and [3] above }
// == getObjectOffset - LocalAreaOffset + StackSize
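//
// Illustrative worked example (editor's note, values assumed): with
// SlotSize = 8, LocalAreaOffset = -8, StackSize = 40 and
// MFI.getObjectOffset(Obj0) = -16, the SP-relative offset is
//   (C - E) = -16 - (-8) + 40 = 32,
// i.e. Obj0 is addressed as 32(%rsp) once the prologue has run.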
return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize);
}
bool X86FrameLowering::assignCalleeSavedSpillSlots(
MachineFunction &MF, const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned CalleeSavedFrameSize = 0;
unsigned XMMCalleeSavedFrameSize = 0;
auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
if (TailCallReturnAddrDelta < 0) {
// create RETURNADDR area
// arg
// arg
// RETADDR
// { ...
// RETADDR area
// ...
// }
// [EBP]
MFI.CreateFixedObject(-TailCallReturnAddrDelta,
TailCallReturnAddrDelta - SlotSize, true);
}
// Spill the BasePtr if it's used.
if (this->TRI->hasBasePointer(MF)) {
// Allocate a spill slot for EBP if we have a base pointer and EH funclets.
if (MF.hasEHFunclets()) {
int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize);
X86FI->setHasSEHFramePtrSave(true);
X86FI->setSEHFramePtrSaveIndex(FI);
}
}
if (hasFP(MF)) {
// emitPrologue always spills the frame register as the very first thing.
SpillSlotOffset -= SlotSize;
MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
// Since emitPrologue and emitEpilogue will handle spilling and restoring of
// the frame register, we can delete it from the CSI list and not have to worry
// about avoiding it later.
Register FPReg = TRI->getFrameRegister(MF);
for (unsigned i = 0; i < CSI.size(); ++i) {
if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
CSI.erase(CSI.begin() + i);
break;
}
}
}
// Assign slots for GPRs. This increases the frame size.
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i - 1].getReg();
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;
SpillSlotOffset -= SlotSize;
CalleeSavedFrameSize += SlotSize;
int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
CSI[i - 1].setFrameIdx(SlotIndex);
}
X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);
// Assign slots for XMMs.
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i - 1].getReg();
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
continue;
// If this is k-register make sure we lookup via the largest legal type.
MVT VT = MVT::Other;
if (X86::VK16RegClass.contains(Reg))
VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
unsigned Size = TRI->getSpillSize(*RC);
unsigned Align = TRI->getSpillAlignment(*RC);
// ensure alignment
assert(SpillSlotOffset < 0 && "SpillSlotOffset should always be < 0 on X86");
SpillSlotOffset = -alignTo(-SpillSlotOffset, Align);
// spill into slot
SpillSlotOffset -= Size;
int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
CSI[i - 1].setFrameIdx(SlotIndex);
MFI.ensureMaxAlignment(Align);
// Save the start offset and size of XMM in stack frame for funclets.
if (X86::VR128RegClass.contains(Reg)) {
WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize;
XMMCalleeSavedFrameSize += Size;
}
}
return true;
}
bool X86FrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBB.findDebugLoc(MI);
// Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI
// for us, and there are no XMM CSRs on Win32.
if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows())
return true;
// Push GPRs. This increases the frame size.
const MachineFunction &MF = *MBB.getParent();
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i - 1].getReg();
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;
const MachineRegisterInfo &MRI = MF.getRegInfo();
bool isLiveIn = MRI.isLiveIn(Reg);
if (!isLiveIn)
MBB.addLiveIn(Reg);
// Decide whether we can add a kill flag to the use.
bool CanKill = !isLiveIn;
// Check if any subregister is live-in
if (CanKill) {
for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) {
if (MRI.isLiveIn(*AReg)) {
CanKill = false;
break;
}
}
}
// Do not set a kill flag on values that are also marked as live-in. This
// happens with the @llvm.returnaddress intrinsic and with arguments
// passed in callee saved registers.
// Omitting the kill flags is conservatively correct even if the live-in
// is not used after all.
BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, getKillRegState(CanKill))
.setMIFlag(MachineInstr::FrameSetup);
}
// Spill the XMM regs. X86 has no push/pop instructions for XMM registers,
// so spill them to the stack frame instead.
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
continue;
// If this is k-register make sure we lookup via the largest legal type.
MVT VT = MVT::Other;
if (X86::VK16RegClass.contains(Reg))
VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
// Add the callee-saved register as live-in. It's killed at the spill.
MBB.addLiveIn(Reg);
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
TRI);
--MI;
MI->setFlag(MachineInstr::FrameSetup);
++MI;
}
return true;
}
void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineInstr *CatchRet) const {
// SEH shouldn't use catchret.
assert(!isAsynchronousEHPersonality(classifyEHPersonality(
MBB.getParent()->getFunction().getPersonalityFn())) &&
"SEH should not use CATCHRET");
DebugLoc DL = CatchRet->getDebugLoc();
MachineBasicBlock *CatchRetTarget = CatchRet->getOperand(0).getMBB();
// Fill EAX/RAX with the address of the target block.
if (STI.is64Bit()) {
// LEA64r CatchRetTarget(%rip), %rax
BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), X86::RAX)
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addMBB(CatchRetTarget)
.addReg(0);
} else {
// MOV32ri $CatchRetTarget, %eax
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
.addMBB(CatchRetTarget);
}
// Record that we've taken the address of CatchRetTarget and no longer just
// reference it in a terminator.
CatchRetTarget->setHasAddressTaken();
}
bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) {
// Don't restore CSRs in 32-bit EH funclets. Matches
// spillCalleeSavedRegisters.
if (STI.is32Bit())
return true;
// Don't restore CSRs before an SEH catchret. SEH except blocks do not form
// funclets. emitEpilogue transforms these to normal jumps.
if (MI->getOpcode() == X86::CATCHRET) {
const Function &F = MBB.getParent()->getFunction();
bool IsSEH = isAsynchronousEHPersonality(
classifyEHPersonality(F.getPersonalityFn()));
if (IsSEH)
return true;
}
}
DebugLoc DL = MBB.findDebugLoc(MI);
// Reload XMMs from stack frame.
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
if (X86::GR64RegClass.contains(Reg) ||
X86::GR32RegClass.contains(Reg))
continue;
// If this is k-register make sure we lookup via the largest legal type.
MVT VT = MVT::Other;
if (X86::VK16RegClass.contains(Reg))
VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
}
// POP GPRs.
unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
if (!X86::GR64RegClass.contains(Reg) &&
!X86::GR32RegClass.contains(Reg))
continue;
BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
.setMIFlag(MachineInstr::FrameDestroy);
}
return true;
}
void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
// Spill the BasePtr if it's used.
if (TRI->hasBasePointer(MF)){
Register BasePtr = TRI->getBaseRegister();
if (STI.isTarget64BitILP32())
BasePtr = getX86SubSuperRegister(BasePtr, 64);
SavedRegs.set(BasePtr);
}
}
static bool
HasNestArgument(const MachineFunction *MF) {
const Function &F = MF->getFunction();
for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
I != E; I++) {
if (I->hasNestAttr() && !I->use_empty())
return true;
}
return false;
}
/// GetScratchRegister - Get a temp register for performing work in the
/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
/// and the properties of the function either one or two registers will be
/// needed. Set primary to true for the first register, false for the second.
static unsigned
GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
CallingConv::ID CallingConvention = MF.getFunction().getCallingConv();
// Erlang stuff.
if (CallingConvention == CallingConv::HiPE) {
if (Is64Bit)
return Primary ? X86::R14 : X86::R13;
else
return Primary ? X86::EBX : X86::EDI;
}
if (Is64Bit) {
if (IsLP64)
return Primary ? X86::R11 : X86::R12;
else
return Primary ? X86::R11D : X86::R12D;
}
bool IsNested = HasNestArgument(&MF);
if (CallingConvention == CallingConv::X86_FastCall ||
CallingConvention == CallingConv::Fast ||
CallingConvention == CallingConv::Tail) {
if (IsNested)
report_fatal_error("Segmented stacks does not support fastcall with "
"nested function.");
return Primary ? X86::EAX : X86::ECX;
}
if (IsNested)
return Primary ? X86::EDX : X86::EAX;
return Primary ? X86::ECX : X86::EAX;
}
// The stack limit in the TCB is set to this many bytes above the actual stack
// limit.
static const uint64_t kSplitStackAvailable = 256;
void X86FrameLowering::adjustForSegmentedStacks(
MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
uint64_t StackSize;
unsigned TlsReg, TlsOffset;
DebugLoc DL;
// To support shrink-wrapping we would need to insert the new blocks
// at the right place and update the branches to PrologueMBB.
assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
"Scratch register is live-in");
if (MF.getFunction().isVarArg())
report_fatal_error("Segmented stacks do not support vararg functions.");
if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
!STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
!STI.isTargetDragonFly())
report_fatal_error("Segmented stacks not supported on this platform.");
// Eventually StackSize will be calculated by a link-time pass, which will
// also decide whether checking code needs to be injected into this particular
// prologue.
StackSize = MFI.getStackSize();
// Do not generate a prologue for leaf functions with a stack of size zero.
// For non-leaf functions we have to allow for the possibility that the
// call is to a non-split function, as in PR37807. This function could also
// take the address of a non-split function. When the linker tries to adjust
// its non-existent prologue, it would fail with an error. Mark the object
// file so that such failures are not errors. See this Go language bug-report
// https://go-review.googlesource.com/c/go/+/148819/
if (StackSize == 0 && !MFI.hasTailCall()) {
MF.getMMI().setHasNosplitStack(true);
return;
}
MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
bool IsNested = false;
// We only need to know if the function has a nest argument in 64-bit mode.
if (Is64Bit)
IsNested = HasNestArgument(&MF);
// The MOV R10, RAX needs to be in a different block, since the RET we emit in
// allocMBB needs to be the last (terminating) instruction.
for (const auto &LI : PrologueMBB.liveins()) {
allocMBB->addLiveIn(LI);
checkMBB->addLiveIn(LI);
}
if (IsNested)
allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);
MF.push_front(allocMBB);
MF.push_front(checkMBB);
// When the frame size is less than 256 we just compare the stack
// boundary directly to the value of the stack pointer, per gcc.
bool CompareStackPointer = StackSize < kSplitStackAvailable;
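// Editor's note (illustrative, values assumed): with the 256-byte slack, a
// 64-byte frame compares %rsp itself against the stored limit, while a
// 1024-byte frame first materializes %rsp - 1024 into the scratch register
// via LEA and compares that instead.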
// Read the limit of the current stacklet from the stack_guard location.
if (Is64Bit) {
if (STI.isTargetLinux()) {
TlsReg = X86::FS;
TlsOffset = IsLP64 ? 0x70 : 0x40;
} else if (STI.isTargetDarwin()) {
TlsReg = X86::GS;
TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
} else if (STI.isTargetWin64()) {
TlsReg = X86::GS;
TlsOffset = 0x28; // pvArbitrary, reserved for application use
} else if (STI.isTargetFreeBSD()) {
TlsReg = X86::FS;
TlsOffset = 0x18;
} else if (STI.isTargetDragonFly()) {
TlsReg = X86::FS;
TlsOffset = 0x20; // use tls_tcb.tcb_segstack
} else {
report_fatal_error("Segmented stacks not supported on this platform.");
}
if (CompareStackPointer)
ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
else
BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
.addImm(1).addReg(0).addImm(-StackSize).addReg(0);
BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
} else {
if (STI.isTargetLinux()) {
TlsReg = X86::GS;
TlsOffset = 0x30;
} else if (STI.isTargetDarwin()) {
TlsReg = X86::GS;
TlsOffset = 0x48 + 90*4;
} else if (STI.isTargetWin32()) {
TlsReg = X86::FS;
TlsOffset = 0x14; // pvArbitrary, reserved for application use
} else if (STI.isTargetDragonFly()) {
TlsReg = X86::FS;
TlsOffset = 0x10; // use tls_tcb.tcb_segstack
} else if (STI.isTargetFreeBSD()) {
report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
} else {
report_fatal_error("Segmented stacks not supported on this platform.");
}
if (CompareStackPointer)
ScratchReg = X86::ESP;
else
BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
.addImm(1).addReg(0).addImm(-StackSize).addReg(0);
if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
STI.isTargetDragonFly()) {
BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
.addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
} else if (STI.isTargetDarwin()) {
// TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
unsigned ScratchReg2;
bool SaveScratch2;
if (CompareStackPointer) {
// The primary scratch register is available for holding the TLS offset.
ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
SaveScratch2 = false;
} else {
// Need to use a second register to hold the TLS offset
ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);
// Unfortunately, with fastcc the second scratch register may hold an
// argument.
SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
}
// If Scratch2 is live-in then it needs to be saved.
assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
"Scratch register is live-in and not saved");
if (SaveScratch2)
BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
.addReg(ScratchReg2, RegState::Kill);
BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
.addImm(TlsOffset);
BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
.addReg(ScratchReg)
.addReg(ScratchReg2).addImm(1).addReg(0)
.addImm(0)
.addReg(TlsReg);
if (SaveScratch2)
BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
}
}
// This jump is taken if SP >= (Stacklet Limit + Stack Space required).
// It jumps to normal execution of the function body.
BuildMI(checkMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_A);
// On 32-bit we first push the argument size and then the frame size. On
// 64-bit, we pass the stack frame size in r10 and the argument size in r11.
if (Is64Bit) {
// Functions with nested arguments use R10, so it needs to be saved across
// the call to _morestack
const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;
if (IsNested)
BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
.addImm(StackSize);
BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
.addImm(X86FI->getArgumentStackSize());
} else {
BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
.addImm(X86FI->getArgumentStackSize());
BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
.addImm(StackSize);
}
// __morestack is in libgcc
if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
// Under the large code model, we cannot assume that __morestack lives
// within 2^31 bytes of the call site, so we cannot use pc-relative
// addressing. We cannot perform the call via a temporary register,
// as the rax register may be used to store the static chain, and all
// other suitable registers may be either callee-save or used for
// parameter passing. We cannot use the stack at this point either
// because __morestack manipulates the stack directly.
//
// To avoid these issues, perform an indirect call via a read-only memory
// location containing the address.
//
// This solution is not perfect, as it assumes that the .rodata section
// is laid out within 2^31 bytes of each function body, but this seems
// to be sufficient for JIT.
// FIXME: Add retpoline support and remove the error here.
- if (STI.useRetpolineIndirectCalls())
+ if (STI.useIndirectThunkCalls())
report_fatal_error("Emitting morestack calls on 64-bit with the large "
- "code model and retpoline not yet implemented.");
+ "code model and thunks not yet implemented.");
BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addExternalSymbol("__morestack_addr")
.addReg(0);
MF.getMMI().setUsesMorestackAddr(true);
} else {
if (Is64Bit)
BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack");
else
BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
.addExternalSymbol("__morestack");
}
if (IsNested)
BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
else
BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
allocMBB->addSuccessor(&PrologueMBB);
checkMBB->addSuccessor(allocMBB, BranchProbability::getZero());
checkMBB->addSuccessor(&PrologueMBB, BranchProbability::getOne());
#ifdef EXPENSIVE_CHECKS
MF.verify();
#endif
}
/// Lookup an ERTS parameter in the !hipe.literals named metadata node.
/// HiPE provides Erlang Runtime System-internal parameters, such as PCB offsets
/// to fields it needs, through a named metadata node "hipe.literals" containing
/// name-value pairs.
static unsigned getHiPELiteral(
NamedMDNode *HiPELiteralsMD, const StringRef LiteralName) {
for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) {
MDNode *Node = HiPELiteralsMD->getOperand(i);
if (Node->getNumOperands() != 2) continue;
MDString *NodeName = dyn_cast<MDString>(Node->getOperand(0));
ValueAsMetadata *NodeVal = dyn_cast<ValueAsMetadata>(Node->getOperand(1));
if (!NodeName || !NodeVal) continue;
ConstantInt *ValConst = dyn_cast_or_null<ConstantInt>(NodeVal->getValue());
if (ValConst && NodeName->getString() == LiteralName) {
return ValConst->getZExtValue();
}
}
report_fatal_error("HiPE literal " + LiteralName
+ " required but not provided");
}
// Return true if there are no non-ehpad successors to MBB and there are no
// non-meta instructions between MBBI and MBB.end().
static bool blockEndIsUnreachable(const MachineBasicBlock &MBB,
MachineBasicBlock::const_iterator MBBI) {
return std::all_of(
MBB.succ_begin(), MBB.succ_end(),
[](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) &&
std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) {
return MI.isMetaInstruction();
});
}
/// Erlang programs may need a special prologue to handle the stack size they
/// might need at runtime. That is because Erlang/OTP does not implement a C
/// stack but uses a custom implementation of a hybrid stack/heap architecture.
/// (for more information see Eric Stenman's Ph.D. thesis:
/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
///
/// CheckStack:
/// temp0 = sp - MaxStack
/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
/// OldStart:
/// ...
/// IncStack:
/// call inc_stack # doubles the stack space
/// temp0 = sp - MaxStack
/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
void X86FrameLowering::adjustForHiPEPrologue(
MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
DebugLoc DL;
// To support shrink-wrapping we would need to insert the new blocks
// at the right place and update the branches to PrologueMBB.
assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
// HiPE-specific values
NamedMDNode *HiPELiteralsMD = MF.getMMI().getModule()
->getNamedMetadata("hipe.literals");
if (!HiPELiteralsMD)
report_fatal_error(
"Can't generate HiPE prologue without runtime parameters");
const unsigned HipeLeafWords
= getHiPELiteral(HiPELiteralsMD,
Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
const unsigned Guaranteed = HipeLeafWords * SlotSize;
unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs ?
MF.getFunction().arg_size() - CCRegisteredArgs : 0;
unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize;
assert(STI.isTargetLinux() &&
"HiPE prologue is only supported on Linux operating systems.");
// Compute the largest caller's frame that is needed to fit the callees'
// frames. This 'MaxStack' is computed from:
//
// a) the fixed frame size, which is the space needed for all spilled temps,
// b) outgoing on-stack parameter areas, and
// c) the minimum stack space this function needs to make available for the
// functions it calls (a tunable ABI property).
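// Editor's note (illustrative numbers, assumed): on x86-64 with
// HipeLeafWords = 24, Guaranteed = 24 * 8 = 192 bytes; a leaf function with
// a 120-byte frame and two stack-passed arguments needs
// MaxStack = 120 + 2*8 + 8 = 144 <= 192, so no stack-check blocks would be
// emitted for it.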
if (MFI.hasCalls()) {
unsigned MoreStackForCalls = 0;
for (auto &MBB : MF) {
for (auto &MI : MBB) {
if (!MI.isCall())
continue;
// Get callee operand.
const MachineOperand &MO = MI.getOperand(0);
// Only take account of global function calls (no closures etc.).
if (!MO.isGlobal())
continue;
const Function *F = dyn_cast<Function>(MO.getGlobal());
if (!F)
continue;
// Do not update 'MaxStack' for primitive and built-in functions
// (encoded with names either starting with "erlang."/"bif_", or lacking
// both the "." of a <Module>.<Function>.<Arity> name and the "_" of a
// BIF such as "suspend_0") as they are executed on another stack.
if (F->getName().find("erlang.") != StringRef::npos ||
F->getName().find("bif_") != StringRef::npos ||
F->getName().find_first_of("._") == StringRef::npos)
continue;
unsigned CalleeStkArity =
F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
if (HipeLeafWords - 1 > CalleeStkArity)
MoreStackForCalls = std::max(MoreStackForCalls,
(HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
}
}
MaxStack += MoreStackForCalls;
}
// If the stack frame needed is larger than the guaranteed size, then runtime checks
// and calls to "inc_stack_0" BIF should be inserted in the assembly prologue.
if (MaxStack > Guaranteed) {
MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
for (const auto &LI : PrologueMBB.liveins()) {
stackCheckMBB->addLiveIn(LI);
incStackMBB->addLiveIn(LI);
}
MF.push_front(incStackMBB);
MF.push_front(stackCheckMBB);
unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
unsigned LEAop, CMPop, CALLop;
SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
if (Is64Bit) {
SPReg = X86::RSP;
PReg = X86::RBP;
LEAop = X86::LEA64r;
CMPop = X86::CMP64rm;
CALLop = X86::CALL64pcrel32;
} else {
SPReg = X86::ESP;
PReg = X86::EBP;
LEAop = X86::LEA32r;
CMPop = X86::CMP32rm;
CALLop = X86::CALLpcrel32;
}
ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
"HiPE prologue scratch register is live-in");
// Create new MBB for StackCheck:
addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
SPReg, false, -MaxStack);
// SPLimitOffset is in a fixed heap location (pointed by BP).
addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
.addReg(ScratchReg), PReg, false, SPLimitOffset);
BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_AE);
// Create new MBB for IncStack:
BuildMI(incStackMBB, DL, TII.get(CALLop)).
addExternalSymbol("inc_stack_0");
addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
SPReg, false, -MaxStack);
addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
.addReg(ScratchReg), PReg, false, SPLimitOffset);
BuildMI(incStackMBB, DL, TII.get(X86::JCC_1)).addMBB(incStackMBB).addImm(X86::COND_LE);
stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
incStackMBB->addSuccessor(incStackMBB, {1, 100});
}
#ifdef EXPENSIVE_CHECKS
MF.verify();
#endif
}
bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
int Offset) const {
if (Offset <= 0)
return false;
if (Offset % SlotSize)
return false;
int NumPops = Offset / SlotSize;
// This is only worth it if we have at most 2 pops.
if (NumPops != 1 && NumPops != 2)
return false;
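// Editor's note (illustrative): on x86-64, popping into a dead GPR is a
// 1-byte instruction, so an 8-byte adjustment right after a call can become
// a single "pop %rcx" instead of the 4-byte "add $8, %rsp".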
// Handle only the trivial case where the adjustment directly follows
// a call. This is the most common one, anyway.
if (MBBI == MBB.begin())
return false;
MachineBasicBlock::iterator Prev = std::prev(MBBI);
if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
return false;
unsigned Regs[2];
unsigned FoundRegs = 0;
auto &MRI = MBB.getParent()->getRegInfo();
auto RegMask = Prev->getOperand(1);
auto &RegClass =
Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
// Try to find up to NumPops free registers.
for (auto Candidate : RegClass) {
// Poor man's liveness:
// Since we're immediately after a call, any register that is clobbered
// by the call and not defined by it can be considered dead.
if (!RegMask.clobbersPhysReg(Candidate))
continue;
// Don't clobber reserved registers
if (MRI.isReserved(Candidate))
continue;
bool IsDef = false;
for (const MachineOperand &MO : Prev->implicit_operands()) {
if (MO.isReg() && MO.isDef() &&
TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) {
IsDef = true;
break;
}
}
if (IsDef)
continue;
Regs[FoundRegs++] = Candidate;
if (FoundRegs == (unsigned)NumPops)
break;
}
if (FoundRegs == 0)
return false;
// If we found only one free register, but need two, reuse the same one twice.
while (FoundRegs < (unsigned)NumPops)
Regs[FoundRegs++] = Regs[0];
for (int i = 0; i < NumPops; ++i)
BuildMI(MBB, MBBI, DL,
TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]);
return true;
}
MachineBasicBlock::iterator X86FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
bool reserveCallFrame = hasReservedCallFrame(MF);
unsigned Opcode = I->getOpcode();
bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
DebugLoc DL = I->getDebugLoc();
uint64_t Amount = TII.getFrameSize(*I);
uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
I = MBB.erase(I);
auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
if (!reserveCallFrame) {
// If the stack pointer can be changed after prologue, turn the
// adjcallstackdown instruction into a 'sub ESP, <amt>' and the
// adjcallstackup instruction into 'add ESP, <amt>'.
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
unsigned StackAlign = getStackAlignment();
Amount = alignTo(Amount, StackAlign);
const Function &F = MF.getFunction();
bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool DwarfCFI = !WindowsCFI && MF.needsFrameMoves();
// If we have any exception handlers in this function, and we adjust
// the SP before calls, we may need to indicate this to the unwinder
// using GNU_ARGS_SIZE. Note that this may be necessary even when
// Amount == 0, because the preceding function may have set a non-0
// GNU_ARGS_SIZE.
// TODO: We don't need to reset this between subsequent functions,
// if it didn't change.
bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();
if (HasDwarfEHHandlers && !isDestroy &&
MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
BuildCFI(MBB, InsertPos, DL,
MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
if (Amount == 0)
return I;
// Factor out the amount that gets handled inside the sequence
// (pushes of arguments for frame setup, callee pops for frame destroy).
Amount -= InternalAmt;
// TODO: This is needed only if we require precise CFA.
// If this is a callee-pop calling convention, emit a CFA adjust for
// the amount the callee popped.
if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
BuildCFI(MBB, InsertPos, DL,
MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
// Add Amount to SP to destroy a frame, or subtract to setup.
int64_t StackAdjustment = isDestroy ? Amount : -Amount;
if (StackAdjustment) {
// Merge with any previous or following adjustment instruction. Note: the
// instructions merged here do not have CFI, so their stack
// adjustments do not feed into CfaAdjustment.
StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
if (StackAdjustment) {
if (!(F.hasMinSize() &&
adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
/*InEpilogue=*/false);
}
}
if (DwarfCFI && !hasFP(MF)) {
// If we don't have FP, but need to generate unwind information,
// we need to set the correct CFA offset after the stack adjustment.
// How much we adjust the CFA offset depends on whether we're emitting
// CFI only for EH purposes or for debugging. EH only requires the CFA
// offset to be correct at each call site, while for debugging we want
// it to be more precise.
int64_t CfaAdjustment = -StackAdjustment;
// TODO: When not using precise CFA, we also need to adjust for the
// InternalAmt here.
if (CfaAdjustment) {
BuildCFI(MBB, InsertPos, DL,
MCCFIInstruction::createAdjustCfaOffset(nullptr,
CfaAdjustment));
}
}
return I;
}
if (isDestroy && InternalAmt && !blockEndIsUnreachable(MBB, I)) {
// If we are performing frame pointer elimination and if the callee pops
// something off the stack pointer, add it back. We do this until we have
// more advanced stack pointer tracking ability.
// We are not tracking the stack pointer adjustment by the callee, so make
// sure we restore the stack pointer immediately after the call, there may
// be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
MachineBasicBlock::iterator CI = I;
MachineBasicBlock::iterator B = MBB.begin();
while (CI != B && !std::prev(CI)->isCall())
--CI;
BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
}
return I;
}
bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
assert(MBB.getParent() && "Block is not attached to a function!");
const MachineFunction &MF = *MBB.getParent();
return !TRI->needsStackRealignment(MF) || !MBB.isLiveIn(X86::EFLAGS);
}
bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
assert(MBB.getParent() && "Block is not attached to a function!");
// Win64 has strict requirements for epilogues, and we are
// not taking a chance at messing with them.
// I.e., unless this block is already an exit block, we can't use
// it as an epilogue.
if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock())
return false;
if (canUseLEAForSPInEpilogue(*MBB.getParent()))
return true;
// If we cannot use LEA to adjust SP, we may need to use ADD, which
// clobbers the EFLAGS. Check that we do not need to preserve it;
// otherwise, conservatively assume it is not safe to insert the
// epilogue here.
return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
}
bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
// If we may need to emit frameless compact unwind information, give
// up as this is currently broken: PR25614.
return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) &&
// The lowering of segmented stack and HiPE only support entry blocks
// as prologue blocks: PR26107.
// This limitation may be lifted if we fix:
// - adjustForSegmentedStacks
// - adjustForHiPEPrologue
MF.getFunction().getCallingConv() != CallingConv::HiPE &&
!MF.shouldSplitStack();
}
MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool RestoreSP) const {
assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
assert(STI.is32Bit() && !Uses64BitFramePtr &&
"restoring EBP/ESI on non-32-bit target");
MachineFunction &MF = *MBB.getParent();
Register FramePtr = TRI->getFrameRegister(MF);
Register BasePtr = TRI->getBaseRegister();
WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
// FIXME: Don't set FrameSetup flag in catchret case.
int FI = FuncInfo.EHRegNodeFrameIndex;
int EHRegSize = MFI.getObjectSize(FI);
if (RestoreSP) {
// MOV32rm -EHRegSize(%ebp), %esp
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
X86::EBP, true, -EHRegSize)
.setMIFlag(MachineInstr::FrameSetup);
}
unsigned UsedReg;
int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg);
int EndOffset = -EHRegOffset - EHRegSize;
FuncInfo.EHRegNodeEndOffset = EndOffset;
if (UsedReg == FramePtr) {
// ADD $offset, %ebp
unsigned ADDri = getADDriOpcode(false, EndOffset);
BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
.addReg(FramePtr)
.addImm(EndOffset)
.setMIFlag(MachineInstr::FrameSetup)
->getOperand(3)
.setIsDead();
assert(EndOffset >= 0 &&
"end of registration object above normal EBP position!");
} else if (UsedReg == BasePtr) {
// LEA offset(%ebp), %esi
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
FramePtr, false, EndOffset)
.setMIFlag(MachineInstr::FrameSetup);
// MOV32rm SavedEBPOffset(%esi), %ebp
assert(X86FI->getHasSEHFramePtrSave());
int Offset =
getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
assert(UsedReg == BasePtr);
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
UsedReg, true, Offset)
.setMIFlag(MachineInstr::FrameSetup);
} else {
llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
}
return MBBI;
}
int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
return TRI->getSlotSize();
}
unsigned X86FrameLowering::getInitialCFARegister(const MachineFunction &MF)
const {
return TRI->getDwarfRegNum(StackPtr, true);
}
namespace {
// Struct used by orderFrameObjects to help sort the stack objects.
struct X86FrameSortingObject {
bool IsValid = false; // true if we care about this Object.
unsigned ObjectIndex = 0; // Index of Object into MFI list.
unsigned ObjectSize = 0; // Size of Object in bytes.
unsigned ObjectAlignment = 1; // Alignment of Object in bytes.
unsigned ObjectNumUses = 0; // Object static number of uses.
};
// The comparison function we use for std::sort to order our local
// stack symbols. The current algorithm is to use an estimated
// "density". This takes into consideration the size and number of
// uses each object has in order to roughly minimize code size.
// So, for example, an object of size 16B that is referenced 5 times
// will get higher priority than 4 4B objects referenced 1 time each.
// It's not perfect and we may be able to squeeze a few more bytes out of
// it (for example: 0(esp) requires fewer bytes, symbols allocated at the
// fringe end can have special consideration, given their size is less
// important, etc.), but the algorithmic complexity grows too much to be
// worth the extra gains we get. This gets us pretty close.
// The final order leaves us with objects with highest priority going
// at the end of our list.
struct X86FrameSortingComparator {
inline bool operator()(const X86FrameSortingObject &A,
const X86FrameSortingObject &B) {
uint64_t DensityAScaled, DensityBScaled;
// For consistency in our comparison, all invalid objects are placed
// at the end. This also allows us to stop walking when we hit the
// first invalid item after it's all sorted.
if (!A.IsValid)
return false;
if (!B.IsValid)
return true;
// The density is calculated by doing:
// (double)DensityA = A.ObjectNumUses / A.ObjectSize
// (double)DensityB = B.ObjectNumUses / B.ObjectSize
// Since this approach may cause inconsistencies in
// the floating point <, >, == comparisons, depending on the floating
// point model with which the compiler was built, we're going
// to scale both sides by multiplying with
// A.ObjectSize * B.ObjectSize. This ends up factoring away
// the division and, with it, the need for any floating point
// arithmetic.
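// Editor's note (illustrative, values assumed): for A = {5 uses, 16 bytes}
// and B = {1 use, 4 bytes}, DensityAScaled = 5 * 4 = 20 and
// DensityBScaled = 1 * 16 = 16, so B compares less than A and the denser
// A sorts later, i.e. with higher allocation priority.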
DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
static_cast<uint64_t>(B.ObjectSize);
DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
static_cast<uint64_t>(A.ObjectSize);
// If the two densities are equal, prioritize highest alignment
// objects. This allows for similar alignment objects
// to be packed together (given the same density).
// There's room for improvement here, also, since we can pack
// similar alignment (different density) objects next to each
// other to save padding. This will also require further
// complexity/iterations, and the overall gain isn't worth it,
// in general. Something to keep in mind, though.
if (DensityAScaled == DensityBScaled)
return A.ObjectAlignment < B.ObjectAlignment;
return DensityAScaled < DensityBScaled;
}
};
} // namespace
// Order the symbols in the local stack.
// We want to place the local stack objects in some sort of sensible order.
// The heuristic we use is to try and pack them according to static number
// of uses and size of object in order to minimize code size.
void X86FrameLowering::orderFrameObjects(
const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
// Don't waste time if there's nothing to do.
if (ObjectsToAllocate.empty())
return;
// Create an array of all MFI objects. We won't need all of these
// objects, but we're going to create a full array of them to make
// it easier to index into when we're counting "uses" down below.
// We want to be able to easily/cheaply access an object by simply
// indexing into it, instead of having to search for it every time.
std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd());
// Walk the objects we care about and mark them as such in our working
// struct.
for (auto &Obj : ObjectsToAllocate) {
SortingObjects[Obj].IsValid = true;
SortingObjects[Obj].ObjectIndex = Obj;
SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlignment(Obj);
// Set the size.
int ObjectSize = MFI.getObjectSize(Obj);
if (ObjectSize == 0)
// Variable size. Just use 4.
SortingObjects[Obj].ObjectSize = 4;
else
SortingObjects[Obj].ObjectSize = ObjectSize;
}
// Count the number of uses for each object.
for (auto &MBB : MF) {
for (auto &MI : MBB) {
if (MI.isDebugInstr())
continue;
for (const MachineOperand &MO : MI.operands()) {
// Check to see if it's a local stack symbol.
if (!MO.isFI())
continue;
int Index = MO.getIndex();
// Check to see if it falls within our range, and is tagged
// to require ordering.
if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
SortingObjects[Index].IsValid)
SortingObjects[Index].ObjectNumUses++;
}
}
}
// Sort the objects using the X86FrameSortingComparator (see its comment
// for more info).
llvm::stable_sort(SortingObjects, X86FrameSortingComparator());
// Now modify the original list to represent the final order that
// we want. The order will depend on whether we're going to access them
// from the stack pointer or the frame pointer. For SP, the objects we
// want at smaller offsets should end up at the END of the list.
// For FP, it should be flipped.
int i = 0;
for (auto &Obj : SortingObjects) {
// All invalid items are sorted at the end, so it's safe to stop.
if (!Obj.IsValid)
break;
ObjectsToAllocate[i++] = Obj.ObjectIndex;
}
// Flip it if we're accessing off of the FP.
if (!TRI->needsStackRealignment(MF) && hasFP(MF))
std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
}
unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
// RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
unsigned Offset = 16;
// RBP is immediately pushed.
Offset += SlotSize;
// All callee-saved registers are then pushed.
Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
// Every funclet allocates enough stack space for the largest outgoing call.
Offset += getWinEHFuncletFrameSize(MF);
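// Editor's note (illustrative, sizes assumed): with two saved GPR CSRs
// (16 bytes) and a 32-byte funclet frame, the result would be
// 16 + 8 + 16 + 32 = 72.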
return Offset;
}
void X86FrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
// Mark the function as not having WinCFI. We will set it back to true in
// emitPrologue if it gets called and emits CFI.
MF.setHasWinCFI(false);
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
const Function &F = MF.getFunction();
if (!STI.is64Bit() || !MF.hasEHFunclets() ||
classifyEHPersonality(F.getPersonalityFn()) != EHPersonality::MSVC_CXX)
return;
// Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
// relative to RSP after the prologue. Find the offset of the last fixed
// object, so that we can allocate a slot immediately following it. If there
// were no fixed objects, use offset -SlotSize, which is immediately after the
// return address. Fixed objects have negative frame indices.
MachineFrameInfo &MFI = MF.getFrameInfo();
WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
int64_t MinFixedObjOffset = -SlotSize;
for (int I = MFI.getObjectIndexBegin(); I < 0; ++I)
MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I));
for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
for (WinEHHandlerType &H : TBME.HandlerArray) {
int FrameIndex = H.CatchObj.FrameIndex;
if (FrameIndex != INT_MAX) {
// Ensure alignment.
unsigned Align = MFI.getObjectAlignment(FrameIndex);
MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
}
}
}
// Ensure alignment.
MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
int UnwindHelpFI =
MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false);
EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
// Store -2 into UnwindHelp on function entry. We have to scan forwards past
// other frame setup instructions.
MachineBasicBlock &MBB = MF.front();
auto MBBI = MBB.begin();
while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
++MBBI;
DebugLoc DL = MBB.findDebugLoc(MBBI);
addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
UnwindHelpFI)
.addImm(-2);
}
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 362609)
@@ -1,5304 +1,5304 @@
//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to a X86 dag.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <stdint.h>
using namespace llvm;
#define DEBUG_TYPE "x86-isel"
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
cl::desc("Enable setting constant bits to reduce size of mask immediates"),
cl::Hidden);
//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//
namespace {
/// This corresponds to X86AddressMode, but uses SDValue's instead of register
/// numbers for the leaves of the matched tree.
struct X86ISelAddressMode {
enum {
RegBase,
FrameIndexBase
} BaseType;
// This is really a union, discriminated by BaseType!
SDValue Base_Reg;
int Base_FrameIndex;
unsigned Scale;
SDValue IndexReg;
int32_t Disp;
SDValue Segment;
const GlobalValue *GV;
const Constant *CP;
const BlockAddress *BlockAddr;
const char *ES;
MCSymbol *MCSym;
int JT;
unsigned Align; // CP alignment.
unsigned char SymbolFlags; // X86II::MO_*
bool NegateIndex = false;
X86ISelAddressMode()
: BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
bool hasSymbolicDisplacement() const {
return GV != nullptr || CP != nullptr || ES != nullptr ||
MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
}
bool hasBaseOrIndexReg() const {
return BaseType == FrameIndexBase ||
IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
}
/// Return true if this addressing mode is already RIP-relative.
bool isRIPRelative() const {
if (BaseType != RegBase) return false;
if (RegisterSDNode *RegNode =
dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
return RegNode->getReg() == X86::RIP;
return false;
}
void setBaseReg(SDValue Reg) {
BaseType = RegBase;
Base_Reg = Reg;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(SelectionDAG *DAG = nullptr) {
dbgs() << "X86ISelAddressMode " << this << '\n';
dbgs() << "Base_Reg ";
if (Base_Reg.getNode())
Base_Reg.getNode()->dump(DAG);
else
dbgs() << "nul\n";
if (BaseType == FrameIndexBase)
dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
dbgs() << " Scale " << Scale << '\n'
<< "IndexReg ";
if (NegateIndex)
dbgs() << "negate ";
if (IndexReg.getNode())
IndexReg.getNode()->dump(DAG);
else
dbgs() << "nul\n";
dbgs() << " Disp " << Disp << '\n'
<< "GV ";
if (GV)
GV->dump();
else
dbgs() << "nul";
dbgs() << " CP ";
if (CP)
CP->dump();
else
dbgs() << "nul";
dbgs() << '\n'
<< "ES ";
if (ES)
dbgs() << ES;
else
dbgs() << "nul";
dbgs() << " MCSym ";
if (MCSym)
dbgs() << MCSym;
else
dbgs() << "nul";
dbgs() << " JT" << JT << " Align" << Align << '\n';
}
#endif
};
}
namespace {
//===--------------------------------------------------------------------===//
/// ISel - X86-specific code to select X86 machine instructions for
/// SelectionDAG operations.
///
class X86DAGToDAGISel final : public SelectionDAGISel {
/// Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
/// If true, selector should try to optimize for code size instead of
/// performance.
bool OptForSize;
/// If true, selector should try to optimize for minimum code size.
bool OptForMinSize;
/// Disable direct TLS access through segment registers.
bool IndirectTlsSegRefs;
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
OptForMinSize(false), IndirectTlsSegRefs(false) {}
StringRef getPassName() const override {
return "X86 DAG->DAG Instruction Selection";
}
bool runOnMachineFunction(MachineFunction &MF) override {
// Reset the subtarget each time through.
Subtarget = &MF.getSubtarget<X86Subtarget>();
IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
"indirect-tls-seg-refs");
// OptFor[Min]Size are used in pattern predicates that isel is matching.
OptForSize = MF.getFunction().hasOptSize();
OptForMinSize = MF.getFunction().hasMinSize();
assert((!OptForMinSize || OptForSize) &&
"OptForMinSize implies OptForSize");
SelectionDAGISel::runOnMachineFunction(MF);
return true;
}
void EmitFunctionEntryCode() override;
bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
void PreprocessISelDAG() override;
void PostprocessISelDAG() override;
// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
private:
void Select(SDNode *N) override;
bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
bool matchAddress(SDValue N, X86ISelAddressMode &AM);
bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth);
bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
bool selectMOV64Imm32(SDValue N, SDValue &Imm);
bool selectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
bool selectLEA64_32Addr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
bool selectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment,
SDValue &NodeWithChain);
bool selectRelocImm(SDValue N, SDValue &Op);
bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment);
// Convenience method where P is also root.
bool tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
}
bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment);
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
void emitSpecialCodeForMain();
inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
MVT VT, SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
Base = CurDAG->getTargetFrameIndex(
AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
else if (AM.Base_Reg.getNode())
Base = AM.Base_Reg;
else
Base = CurDAG->getRegister(0, VT);
Scale = getI8Imm(AM.Scale, DL);
// Negate the index if needed.
if (AM.NegateIndex) {
unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
AM.IndexReg), 0);
AM.IndexReg = Neg;
}
if (AM.IndexReg.getNode())
Index = AM.IndexReg;
else
Index = CurDAG->getRegister(0, VT);
// These are 32-bit even in 64-bit mode since RIP-relative offset
// is 32-bit.
if (AM.GV)
Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
MVT::i32, AM.Disp,
AM.SymbolFlags);
else if (AM.CP)
Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
AM.Align, AM.Disp, AM.SymbolFlags);
else if (AM.ES) {
assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
} else if (AM.MCSym) {
assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
assert(AM.SymbolFlags == 0 && "oo");
Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
} else if (AM.JT != -1) {
assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
} else if (AM.BlockAddr)
Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
AM.SymbolFlags);
else
Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
if (AM.Segment.getNode())
Segment = AM.Segment;
else
Segment = CurDAG->getRegister(0, MVT::i16);
}
// Utility function to determine whether we should avoid selecting
// immediate forms of instructions for better code size or not.
// At a high level, we'd like to avoid such instructions when
// we have similar constants used within the same basic block
// that can be kept in a register.
//
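// Editor's note (illustrative): if the same 32-bit constant feeds two
// ADDs in one block, materializing it once in a register and using the
// reg-reg forms can be smaller than encoding the 4-byte immediate at each
// use; that is roughly the situation this heuristic tries to detect.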
bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
uint32_t UseCount = 0;
// Do not want to hoist if we're not optimizing for size.
// TODO: We'd like to remove this restriction.
// See the comment in X86InstrInfo.td for more info.
if (!CurDAG->shouldOptForSize())
return false;
// Walk all the users of the immediate.
for (SDNode::use_iterator UI = N->use_begin(),
UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
SDNode *User = *UI;
// This user is already selected. Count it as a legitimate use and
// move on.
if (User->isMachineOpcode()) {
UseCount++;
continue;
}
// We want to count stores of immediates as real uses.
if (User->getOpcode() == ISD::STORE &&
User->getOperand(1).getNode() == N) {
UseCount++;
continue;
}
// We don't currently match users that have > 2 operands (except
// for stores, which are handled above).
// Those instructions won't match in ISEL, for now, and would
// be counted incorrectly.
// This may change in the future as we add additional instruction
// types.
if (User->getNumOperands() != 2)
continue;
// If this can match to INC/DEC, don't count it as a use.
if (User->getOpcode() == ISD::ADD &&
(isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
continue;
// Immediates that are used for offsets as part of stack
// manipulation should be left alone. These are typically
// used to indicate SP offsets for argument passing and
// will get pulled into stores/pushes (implicitly).
if (User->getOpcode() == X86ISD::ADD ||
User->getOpcode() == ISD::ADD ||
User->getOpcode() == X86ISD::SUB ||
User->getOpcode() == ISD::SUB) {
// Find the other operand of the add/sub.
SDValue OtherOp = User->getOperand(0);
if (OtherOp.getNode() == N)
OtherOp = User->getOperand(1);
// Don't count if the other operand is SP.
RegisterSDNode *RegNode;
if (OtherOp->getOpcode() == ISD::CopyFromReg &&
(RegNode = dyn_cast_or_null<RegisterSDNode>(
OtherOp->getOperand(1).getNode())))
if ((RegNode->getReg() == X86::ESP) ||
(RegNode->getReg() == X86::RSP))
continue;
}
// ... otherwise, count this and move on.
UseCount++;
}
// If we have more than 1 use, then recommend hoisting.
return (UseCount > 1);
}
/// Return a target constant with the specified value of type i8.
inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
}
/// Return a target constant with the specified value, of type i32.
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
/// Return a target constant with the specified value, of type i64.
inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
}
SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
const SDLoc &DL) {
assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
uint64_t Index = N->getConstantOperandVal(1);
MVT VecVT = N->getOperand(0).getSimpleValueType();
return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
}
SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
const SDLoc &DL) {
assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
uint64_t Index = N->getConstantOperandVal(2);
MVT VecVT = N->getSimpleValueType(0);
return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
}
// Helper to detect unneeded 'and' instructions on shift amounts. Called
// from PatFrags in tablegen.
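// For example, with Width == 5 (a 32-bit shift), an (and amt, 31) mask has
// five trailing ones and is redundant, since the hardware already masks the
// shift count to 5 bits.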
bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
if (Val.countTrailingOnes() >= Width)
return true;
APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
return Mask.countTrailingOnes() >= Width;
}
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
SDNode *getGlobalBaseReg();
/// Return a reference to the TargetMachine, casted to the target-specific
/// type.
const X86TargetMachine &getTargetMachine() const {
return static_cast<const X86TargetMachine &>(TM);
}
/// Return a reference to the TargetInstrInfo, casted to the target-specific
/// type.
const X86InstrInfo *getInstrInfo() const {
return Subtarget->getInstrInfo();
}
/// Address-mode matching performs shift-of-and to and-of-shift
/// reassociation in order to expose more scaled addressing
/// opportunities.
bool ComplexPatternFuncMutatesDAG() const override {
return true;
}
bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
/// Returns whether this is a relocatable immediate in the range
/// [-2^(Width-1) .. 2^(Width-1)-1].
template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
if (auto *CN = dyn_cast<ConstantSDNode>(N))
return isInt<Width>(CN->getSExtValue());
return isSExtAbsoluteSymbolRef(Width, N);
}
// Indicates we should prefer to use a non-temporal load for this load.
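// Non-temporal vector loads (MOVNTDQA and its VEX/EVEX forms) only exist for
// aligned 16/32/64-byte accesses, which is what the checks below model.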
bool useNonTemporalLoad(LoadSDNode *N) const {
if (!N->isNonTemporal())
return false;
unsigned StoreSize = N->getMemoryVT().getStoreSize();
if (N->getAlignment() < StoreSize)
return false;
switch (StoreSize) {
default: llvm_unreachable("Unsupported store size");
case 4:
case 8:
return false;
case 16:
return Subtarget->hasSSE41();
case 32:
return Subtarget->hasAVX2();
case 64:
return Subtarget->hasAVX512();
}
}
bool foldLoadStoreIntoMemOperand(SDNode *Node);
MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
bool matchBitExtract(SDNode *Node);
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
bool tryShiftAmountMod(SDNode *N);
bool combineIncDecVector(SDNode *Node);
bool tryShrinkShlLogicImm(SDNode *N);
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
bool tryMatchBitSelect(SDNode *N);
MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node);
MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node,
SDValue &InFlag);
bool tryOptimizeRem8Extend(SDNode *N);
bool onlyUsesZeroFlag(SDValue Flags) const;
bool hasNoSignFlagUses(SDValue Flags) const;
bool hasNoCarryFlagUses(SDValue Flags) const;
};
}
// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode == X86ISD::CMPM || Opcode == X86ISD::STRICT_CMPM ||
Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_SAE ||
Opcode == X86ISD::VFPCLASS) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
EVT OpVT = N->getOperand(0).getValueType();
// The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
// second operand.
if (Opcode == X86ISD::STRICT_CMPM)
OpVT = N->getOperand(1).getValueType();
if (OpVT.is256BitVector() || OpVT.is128BitVector())
return Subtarget->hasVLX();
return true;
}
// Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
Opcode == X86ISD::FSETCCM_SAE)
return true;
return false;
}
// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
// If this is an AND, check if we have a compare on either side. As long as
// one side guarantees the mask is zero extended, the AND will preserve those
// zeros.
if (N->getOpcode() == ISD::AND)
return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
return isLegalMaskCompare(N, Subtarget);
}
bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
if (OptLevel == CodeGenOpt::None) return false;
if (!N.hasOneUse())
return false;
// FIXME: Temporary hack to prevent strict floating point nodes from
// folding into masked operations illegally.
if (U == Root && Root->getOpcode() == ISD::VSELECT &&
N.getOpcode() != ISD::LOAD && N.getOpcode() != X86ISD::VBROADCAST_LOAD)
return false;
if (N.getOpcode() != ISD::LOAD)
return true;
// Don't fold non-temporal loads if we have an instruction for them.
if (useNonTemporalLoad(cast<LoadSDNode>(N)))
return false;
// If N is a load, do additional profitability checks.
if (U == Root) {
switch (U->getOpcode()) {
default: break;
case X86ISD::ADD:
case X86ISD::ADC:
case X86ISD::SUB:
case X86ISD::SBB:
case X86ISD::AND:
case X86ISD::XOR:
case X86ISD::OR:
case ISD::ADD:
case ISD::ADDCARRY:
case ISD::AND:
case ISD::OR:
case ISD::XOR: {
SDValue Op1 = U->getOperand(1);
// If the other operand is an 8-bit immediate we should fold the immediate
// instead. This reduces code size.
// e.g.
// movl 4(%esp), %eax
// addl $4, %eax
// vs.
// movl $4, %eax
// addl 4(%esp), %eax
// The former is 2 bytes shorter. In the case where the increment is 1,
// the saving can be 4 bytes (by using incl %eax).
if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
if (Imm->getAPIntValue().isSignedIntN(8))
return false;
// If this is a 64-bit AND with an immediate that fits in 32-bits,
// prefer using the smaller and over folding the load. This is needed to
// make sure immediates created by shrinkAndImmediate are always folded.
// Ideally we would narrow the load during DAG combine and get the
// best of both worlds.
if (U->getOpcode() == ISD::AND &&
Imm->getAPIntValue().getBitWidth() == 64 &&
Imm->getAPIntValue().isIntN(32))
return false;
// If this is really a zext_inreg that can be represented with a movzx
// instruction, prefer that.
// TODO: We could shrink the load and fold if it is non-volatile.
if (U->getOpcode() == ISD::AND &&
(Imm->getAPIntValue() == UINT8_MAX ||
Imm->getAPIntValue() == UINT16_MAX ||
Imm->getAPIntValue() == UINT32_MAX))
return false;
// For ADD/SUB we can negate the immediate and use the opposite operation
// to fit 128 into a sign-extended 8-bit immediate.
if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
(-Imm->getAPIntValue()).isSignedIntN(8))
return false;
}
// If the other operand is a TLS address, we should fold it instead.
// This produces
// movl %gs:0, %eax
// leal i@NTPOFF(%eax), %eax
// instead of
// movl $i@NTPOFF, %eax
// addl %gs:0, %eax
// If the block also has an access to a second TLS address, this will save
// a load.
// FIXME: This is probably also true for non-TLS addresses.
if (Op1.getOpcode() == X86ISD::Wrapper) {
SDValue Val = Op1.getOperand(0);
if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
return false;
}
// Don't fold load if this matches the BTS/BTR/BTC patterns.
// BTS: (or X, (shl 1, n))
// BTR: (and X, (rotl -2, n))
// BTC: (xor X, (shl 1, n))
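// Here (rotl -2, n) rotates the single zero bit of ~1 into position n,
// producing the clear-bit-n mask that BTR implements.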
if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
if (U->getOperand(0).getOpcode() == ISD::SHL &&
isOneConstant(U->getOperand(0).getOperand(0)))
return false;
if (U->getOperand(1).getOpcode() == ISD::SHL &&
isOneConstant(U->getOperand(1).getOperand(0)))
return false;
}
if (U->getOpcode() == ISD::AND) {
SDValue U0 = U->getOperand(0);
SDValue U1 = U->getOperand(1);
if (U0.getOpcode() == ISD::ROTL) {
auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
if (C && C->getSExtValue() == -2)
return false;
}
if (U1.getOpcode() == ISD::ROTL) {
auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
if (C && C->getSExtValue() == -2)
return false;
}
}
break;
}
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
// Don't fold a load into a shift by immediate. The BMI2 instructions
// support folding a load, but not an immediate. The legacy instructions
// support folding an immediate, but can't fold a load. Folding an
// immediate is preferable to folding a load.
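// For example (illustrative): BMI2 SHLX can take its source from memory
// (shlxq %rcx, (%rdi), %rax) but has no immediate-count form, while the
// legacy shlq $3, %rax encodes the count but cannot fold a load.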
if (isa<ConstantSDNode>(U->getOperand(1)))
return false;
break;
}
}
// Prevent folding a load if this can be implemented with an insert_subreg or
// a move that implicitly zeroes.
if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
isNullConstant(Root->getOperand(2)) &&
(Root->getOperand(0).isUndef() ||
ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
return false;
return true;
}
/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
SDValue Call, SDValue OrigChain) {
SmallVector<SDValue, 8> Ops;
SDValue Chain = OrigChain.getOperand(0);
if (Chain.getNode() == Load.getNode())
Ops.push_back(Load.getOperand(0));
else {
assert(Chain.getOpcode() == ISD::TokenFactor &&
"Unexpected chain operand");
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
if (Chain.getOperand(i).getNode() == Load.getNode())
Ops.push_back(Load.getOperand(0));
else
Ops.push_back(Chain.getOperand(i));
SDValue NewChain =
CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
Ops.clear();
Ops.push_back(NewChain);
}
Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
Load.getOperand(1), Load.getOperand(2));
Ops.clear();
Ops.push_back(SDValue(Load.getNode(), 1));
Ops.append(Call->op_begin() + 1, Call->op_end());
CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}
/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
// The transformation is somewhat dangerous if the call's chain was glued to
// the call. After moveBelowOrigChain the load is moved between the call and
// the chain; this can create a cycle if the load is not folded. So it is
// *really* important that we are sure the load will be folded.
if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
return false;
LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
if (!LD ||
!LD->isSimple() ||
LD->getAddressingMode() != ISD::UNINDEXED ||
LD->getExtensionType() != ISD::NON_EXTLOAD)
return false;
// Now let's find the callseq_start.
while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
if (!Chain.hasOneUse())
return false;
Chain = Chain.getOperand(0);
}
if (!Chain.getNumOperands())
return false;
// Since we are not checking for AA here, conservatively abort if the chain
// writes to memory. It's not safe to move the callee (a load) across a store.
if (isa<MemSDNode>(Chain.getNode()) &&
cast<MemSDNode>(Chain.getNode())->writeMem())
return false;
if (Chain.getOperand(0).getNode() == Callee.getNode())
return true;
if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
Callee.getValue(1).hasOneUse())
return true;
return false;
}
void X86DAGToDAGISel::PreprocessISelDAG() {
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
// If this is a target specific AND node with no flag usages, turn it back
// into ISD::AND to enable test instruction matching.
if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
CurDAG->DeleteNode(N);
continue;
}
switch (N->getOpcode()) {
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::STRICT_FP_TO_SINT:
case ISD::STRICT_FP_TO_UINT: {
// Replace vector fp_to_s/uint with their X86-specific equivalents so we
// don't need 2 sets of patterns.
if (!N->getSimpleValueType(0).isVector())
break;
unsigned NewOpc;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
}
SDValue Res;
if (N->isStrictFPOpcode())
Res =
CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
{N->getOperand(0), N->getOperand(1)});
else
Res =
CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
N->getOperand(0));
--I;
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
++I;
CurDAG->DeleteNode(N);
continue;
}
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: {
// Replace vector shifts with their X86-specific equivalents so we don't
// need 2 sets of patterns.
if (!N->getValueType(0).isVector())
break;
unsigned NewOpc;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
}
SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
CurDAG->DeleteNode(N);
continue;
}
case ISD::ANY_EXTEND:
case ISD::ANY_EXTEND_VECTOR_INREG: {
// Replace vector any extend with the zero extend equivalents so we don't
// need 2 sets of patterns. Ignore vXi1 extensions.
if (!N->getValueType(0).isVector() ||
N->getOperand(0).getScalarValueSizeInBits() == 1)
break;
unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
? ISD::ZERO_EXTEND
: ISD::ZERO_EXTEND_VECTOR_INREG;
SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
N->getOperand(0));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
CurDAG->DeleteNode(N);
continue;
}
case ISD::FCEIL:
case ISD::STRICT_FCEIL:
case ISD::FFLOOR:
case ISD::STRICT_FFLOOR:
case ISD::FTRUNC:
case ISD::STRICT_FTRUNC:
case ISD::FNEARBYINT:
case ISD::STRICT_FNEARBYINT:
case ISD::FRINT:
case ISD::STRICT_FRINT: {
// Replace fp rounding ops with their X86-specific equivalents so we don't
// need 2 sets of patterns.
unsigned Imm;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::STRICT_FCEIL:
case ISD::FCEIL: Imm = 0xA; break;
case ISD::STRICT_FFLOOR:
case ISD::FFLOOR: Imm = 0x9; break;
case ISD::STRICT_FTRUNC:
case ISD::FTRUNC: Imm = 0xB; break;
case ISD::STRICT_FNEARBYINT:
case ISD::FNEARBYINT: Imm = 0xC; break;
case ISD::STRICT_FRINT:
case ISD::FRINT: Imm = 0x4; break;
}
SDLoc dl(N);
bool IsStrict = N->isStrictFPOpcode();
SDValue Res;
if (IsStrict)
Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
{N->getValueType(0), MVT::Other},
{N->getOperand(0), N->getOperand(1),
CurDAG->getTargetConstant(Imm, dl, MVT::i8)});
else
Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
N->getOperand(0),
CurDAG->getTargetConstant(Imm, dl, MVT::i8));
--I;
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
++I;
CurDAG->DeleteNode(N);
continue;
}
case X86ISD::FANDN:
case X86ISD::FAND:
case X86ISD::FOR:
case X86ISD::FXOR: {
// Widen scalar fp logic ops to vector to reduce isel patterns.
// FIXME: Can we do this during lowering/combine?
MVT VT = N->getSimpleValueType(0);
if (VT.isVector() || VT == MVT::f128)
break;
MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
SDLoc dl(N);
SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
N->getOperand(0));
SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
N->getOperand(1));
SDValue Res;
if (Subtarget->hasSSE2()) {
EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
unsigned Opc;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
case X86ISD::FAND: Opc = ISD::AND; break;
case X86ISD::FOR: Opc = ISD::OR; break;
case X86ISD::FXOR: Opc = ISD::XOR; break;
}
Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
} else {
Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
}
Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
CurDAG->getIntPtrConstant(0, dl));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
CurDAG->DeleteNode(N);
continue;
}
}
if (OptLevel != CodeGenOpt::None &&
// Only do this when the target can fold the load into the call or
// jmp.
- !Subtarget->useRetpolineIndirectCalls() &&
+ !Subtarget->useIndirectThunkCalls() &&
((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
(N->getOpcode() == X86ISD::TC_RETURN &&
(Subtarget->is64Bit() ||
!getTargetMachine().isPositionIndependent())))) {
/// Also try moving call address load from outside callseq_start to just
/// before the call to allow it to be folded.
///
/// [Load chain]
/// ^
/// |
/// [Load]
/// ^ ^
/// | |
/// / \--
/// / |
///[CALLSEQ_START] |
/// ^ |
/// | |
/// [LOAD/C2Reg] |
/// | |
/// \ /
/// \ /
/// [CALL]
bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
SDValue Chain = N->getOperand(0);
SDValue Load = N->getOperand(1);
if (!isCalleeLoad(Load, Chain, HasCallSeq))
continue;
moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
++NumLoadMoved;
continue;
}
// Lower fpround and fpextend nodes that target the FP stack to a store and
// load through the stack. This is a gross hack. We would like to simply mark
// these as being illegal, but when we do that, legalize produces these when
// it expands calls, then expands these in the same legalize pass. We would
// like dag combine to be able to hack on these between the call expansion
// and the node legalization. As such this pass basically does "really
// late" legalization of these inline with the X86 isel pass.
// FIXME: This should only happen when not compiled with -O0.
switch (N->getOpcode()) {
default: continue;
case ISD::FP_ROUND:
case ISD::FP_EXTEND:
{
MVT SrcVT = N->getOperand(0).getSimpleValueType();
MVT DstVT = N->getSimpleValueType(0);
// If any of the sources are vectors, no fp stack involved.
if (SrcVT.isVector() || DstVT.isVector())
continue;
// If the source and destination are SSE registers, then this is a legal
// conversion that should not be lowered.
const X86TargetLowering *X86Lowering =
static_cast<const X86TargetLowering *>(TLI);
bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
if (SrcIsSSE && DstIsSSE)
continue;
if (!SrcIsSSE && !DstIsSSE) {
// If this is an FPStack extension, it is a noop.
if (N->getOpcode() == ISD::FP_EXTEND)
continue;
// If this is a value-preserving FPStack truncation, it is a noop.
if (N->getConstantOperandVal(1))
continue;
}
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
// FPStack has extload and truncstore. SSE can fold direct loads into other
// operations. Based on this, decide what we want to do.
MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
MemTmp, MachinePointerInfo(), MemVT);
SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
MachinePointerInfo(), MemVT);
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
// extload we created. This will cause general havoc on the DAG because
// anything below the conversion could be folded into other existing nodes.
// To avoid invalidating 'I', back it up to the convert node.
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
break;
}
// The sequence of events for lowering STRICT_FP versions of these nodes requires
// dealing with the chain differently, as there is already a preexisting chain.
case ISD::STRICT_FP_ROUND:
case ISD::STRICT_FP_EXTEND:
{
MVT SrcVT = N->getOperand(1).getSimpleValueType();
MVT DstVT = N->getSimpleValueType(0);
// If any of the sources are vectors, no fp stack involved.
if (SrcVT.isVector() || DstVT.isVector())
continue;
// If the source and destination are SSE registers, then this is a legal
// conversion that should not be lowered.
const X86TargetLowering *X86Lowering =
static_cast<const X86TargetLowering *>(TLI);
bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
if (SrcIsSSE && DstIsSSE)
continue;
if (!SrcIsSSE && !DstIsSSE) {
// If this is an FPStack extension, it is a noop.
if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
continue;
// If this is a value-preserving FPStack truncation, it is a noop.
if (N->getConstantOperandVal(2))
continue;
}
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
// FPStack has extload and truncstore. SSE can fold direct loads into other
// operations. Based on this, decide what we want to do.
MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
// Since the operation is StrictFP, use the preexisting chain.
SDValue Store, Result;
if (!SrcIsSSE) {
SDVTList VTs = CurDAG->getVTList(MVT::Other);
SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
MachinePointerInfo(), 0,
MachineMemOperand::MOStore);
if (N->getFlags().hasNoFPExcept()) {
SDNodeFlags Flags = Store->getFlags();
Flags.setNoFPExcept(true);
Store->setFlags(Flags);
}
} else {
assert(SrcVT == MemVT && "Unexpected VT!");
Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
MachinePointerInfo());
}
if (!DstIsSSE) {
SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
SDValue Ops[] = {Store, MemTmp};
Result = CurDAG->getMemIntrinsicNode(X86ISD::FLD, dl, VTs, Ops, MemVT,
MachinePointerInfo(), 0,
MachineMemOperand::MOLoad);
if (N->getFlags().hasNoFPExcept()) {
SDNodeFlags Flags = Result->getFlags();
Flags.setNoFPExcept(true);
Result->setFlags(Flags);
}
} else {
assert(DstVT == MemVT && "Unexpected VT!");
Result =
CurDAG->getLoad(DstVT, dl, Store, MemTmp, MachinePointerInfo());
}
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
// extload we created. This will cause general havoc on the DAG because
// anything below the conversion could be folded into other existing nodes.
// To avoid invalidating 'I', back it up to the convert node.
--I;
CurDAG->ReplaceAllUsesWith(N, Result.getNode());
break;
}
}
// Now that we did that, the node is dead. Increment the iterator to the
// next node to process, then delete N.
++I;
CurDAG->DeleteNode(N);
}
// The load+call transform above can leave some dead nodes in the graph. Make
// sure we remove them. It's possible some of the other transforms do too, so
// just remove dead nodes unconditionally.
CurDAG->RemoveDeadNodes();
}
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
unsigned Opc = N->getMachineOpcode();
if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
Opc != X86::MOVSX64rr8)
return false;
SDValue N0 = N->getOperand(0);
// We need to be extracting the lower bit of an extend.
if (!N0.isMachineOpcode() ||
N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
N0.getConstantOperandVal(1) != X86::sub_8bit)
return false;
// We're looking for either a movsx or movzx to match the original opcode.
unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
: X86::MOVSX32rr8_NOREX;
SDValue N00 = N0.getOperand(0);
if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
return false;
if (Opc == X86::MOVSX64rr8) {
// If we had a sign extend from 8 to 64 bits, we still need to go from 32
// to 64.
MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
MVT::i64, N00);
ReplaceUses(N, Extend);
} else {
// Ok we can drop this extend and just use the original extend.
ReplaceUses(N, N00.getNode());
}
return true;
}
void X86DAGToDAGISel::PostprocessISelDAG() {
// Skip peepholes at -O0.
if (TM.getOptLevel() == CodeGenOpt::None)
return;
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {
SDNode *N = &*--Position;
// Skip dead nodes and any non-machine opcodes.
if (N->use_empty() || !N->isMachineOpcode())
continue;
if (tryOptimizeRem8Extend(N)) {
MadeChange = true;
continue;
}
// Look for a TESTrr+ANDrr pattern where both operands of the test are
// the same. Rewrite to remove the AND.
unsigned Opc = N->getMachineOpcode();
if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
N->getOperand(0) == N->getOperand(1) &&
N->isOnlyUserOf(N->getOperand(0).getNode()) &&
N->getOperand(0).isMachineOpcode()) {
SDValue And = N->getOperand(0);
unsigned N0Opc = And.getMachineOpcode();
if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
MVT::i32,
And.getOperand(0),
And.getOperand(1));
ReplaceUses(N, Test);
MadeChange = true;
continue;
}
if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
unsigned NewOpc;
switch (N0Opc) {
case X86::AND8rm: NewOpc = X86::TEST8mr; break;
case X86::AND16rm: NewOpc = X86::TEST16mr; break;
case X86::AND32rm: NewOpc = X86::TEST32mr; break;
case X86::AND64rm: NewOpc = X86::TEST64mr; break;
}
// Need to swap the memory and register operand.
SDValue Ops[] = { And.getOperand(1),
And.getOperand(2),
And.getOperand(3),
And.getOperand(4),
And.getOperand(5),
And.getOperand(0),
And.getOperand(6) /* Chain */ };
MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
MVT::i32, MVT::Other, Ops);
ReplaceUses(N, Test);
MadeChange = true;
continue;
}
}
// Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
// used. We're doing this late so we can prefer to fold the AND into masked
// comparisons. Doing that can be better for the live range of the mask
// register.
if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
N->getOperand(0) == N->getOperand(1) &&
N->isOnlyUserOf(N->getOperand(0).getNode()) &&
N->getOperand(0).isMachineOpcode() &&
onlyUsesZeroFlag(SDValue(N, 0))) {
SDValue And = N->getOperand(0);
unsigned N0Opc = And.getMachineOpcode();
// KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
// KAND instructions and KTEST use the same ISA feature.
if (N0Opc == X86::KANDBrr ||
(N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
unsigned NewOpc;
switch (Opc) {
default: llvm_unreachable("Unexpected opcode!");
case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
}
MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
MVT::i32,
And.getOperand(0),
And.getOperand(1));
ReplaceUses(N, KTest);
MadeChange = true;
continue;
}
}
// Attempt to remove vector moves that were inserted to zero upper bits.
if (Opc != TargetOpcode::SUBREG_TO_REG)
continue;
unsigned SubRegIdx = N->getConstantOperandVal(2);
if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
continue;
SDValue Move = N->getOperand(1);
if (!Move.isMachineOpcode())
continue;
// Make sure it's one of the move opcodes we recognize.
switch (Move.getMachineOpcode()) {
default:
continue;
case X86::VMOVAPDrr: case X86::VMOVUPDrr:
case X86::VMOVAPSrr: case X86::VMOVUPSrr:
case X86::VMOVDQArr: case X86::VMOVDQUrr:
case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
break;
}
SDValue In = Move.getOperand(0);
if (!In.isMachineOpcode() ||
In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
continue;
// Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
// the SHA instructions, which use a legacy encoding.
uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
(TSFlags & X86II::EncodingMask) != X86II::EVEX &&
(TSFlags & X86II::EncodingMask) != X86II::XOP)
continue;
// The producing instruction is another vector instruction, so we can drop
// the move.
CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
MadeChange = true;
}
if (MadeChange)
CurDAG->RemoveDeadNodes();
}
/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
if (Subtarget->isTargetCygMing()) {
TargetLowering::ArgListTy Args;
auto &DL = CurDAG->getDataLayout();
TargetLowering::CallLoweringInfo CLI(*CurDAG);
CLI.setChain(CurDAG->getRoot())
.setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
std::move(Args));
const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
CurDAG->setRoot(Result.second);
}
}
void X86DAGToDAGISel::EmitFunctionEntryCode() {
// If this is main, emit special code for main.
const Function &F = MF->getFunction();
if (F.hasExternalLinkage() && F.getName() == "main")
emitSpecialCodeForMain();
}
static bool isDispSafeForFrameIndex(int64_t Val) {
// On 64-bit platforms, we can run into an issue where a frame index
// includes a displacement that, when added to the explicit displacement,
// will overflow the displacement field. Assuming that the frame index
// displacement fits into a 31-bit integer (which is only slightly more
// aggressive than the current fundamental assumption that it fits into
// a 32-bit integer), a 31-bit disp should always be safe.
return isInt<31>(Val);
}
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
X86ISelAddressMode &AM) {
// If there's no offset to fold, we don't need to do any work.
if (Offset == 0)
return false;
// Cannot combine ExternalSymbol displacements with integer offsets.
if (AM.ES || AM.MCSym)
return true;
int64_t Val = AM.Disp + Offset;
CodeModel::Model M = TM.getCodeModel();
if (Subtarget->is64Bit()) {
if (!X86::isOffsetSuitableForCodeModel(Val, M,
AM.hasSymbolicDisplacement()))
return true;
// In addition to the checks required for a register base, check that
// we do not try to use an unsafe Disp with a frame index.
if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
!isDispSafeForFrameIndex(Val))
return true;
}
AM.Disp = Val;
return false;
}
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
SDValue Address = N->getOperand(1);
// load gs:0 -> GS segment register.
// load fs:0 -> FS segment register.
//
// This optimization is valid because the GNU TLS model defines that
// gs:0 (or fs:0 on X86-64) contains its own address.
// For more information see http://people.redhat.com/drepper/tls.pdf
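// For example, on x86-64 Linux "movq %fs:0, %rax" loads the thread pointer,
// which under this ABI is the address of %fs:0 itself.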
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
!IndirectTlsSegRefs &&
(Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
Subtarget->isTargetFuchsia()))
switch (N->getPointerInfo().getAddrSpace()) {
case 256:
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
return false;
case 257:
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
return false;
// Address space 258 is not handled here, because it is not used to
// address TLS areas.
}
return true;
}
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
// If the addressing mode already has a symbol as the displacement, we can
// never match another symbol.
if (AM.hasSymbolicDisplacement())
return true;
bool IsRIPRelTLS = false;
bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
if (IsRIPRel) {
SDValue Val = N.getOperand(0);
if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
IsRIPRelTLS = true;
}
// We can't use an addressing mode in the 64-bit large code model.
// Global TLS addressing is an exception. In the medium code model,
// we can use such a mode when RIP wrappers are present.
// That signifies access to globals that are known to be "near",
// such as the GOT itself.
CodeModel::Model M = TM.getCodeModel();
if (Subtarget->is64Bit() &&
((M == CodeModel::Large && !IsRIPRelTLS) ||
(M == CodeModel::Medium && !IsRIPRel)))
return true;
// Base and index reg must be 0 in order to use %rip as base.
if (IsRIPRel && AM.hasBaseOrIndexReg())
return true;
// Make a local copy in case we can't do this fold.
X86ISelAddressMode Backup = AM;
int64_t Offset = 0;
SDValue N0 = N.getOperand(0);
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
AM.GV = G->getGlobal();
AM.SymbolFlags = G->getTargetFlags();
Offset = G->getOffset();
} else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
AM.CP = CP->getConstVal();
AM.Align = CP->getAlignment();
AM.SymbolFlags = CP->getTargetFlags();
Offset = CP->getOffset();
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
AM.ES = S->getSymbol();
AM.SymbolFlags = S->getTargetFlags();
} else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
AM.MCSym = S->getMCSymbol();
} else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
AM.JT = J->getIndex();
AM.SymbolFlags = J->getTargetFlags();
} else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
AM.BlockAddr = BA->getBlockAddress();
AM.SymbolFlags = BA->getTargetFlags();
Offset = BA->getOffset();
} else
llvm_unreachable("Unhandled symbol reference node.");
if (foldOffsetIntoAddress(Offset, AM)) {
AM = Backup;
return true;
}
if (IsRIPRel)
AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
// Commit the changes now that we know this fold is safe.
return false;
}
/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
if (matchAddressRecursively(N, AM, 0))
return true;
// Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
// a smaller encoding and avoids a scaled-index.
if (AM.Scale == 2 &&
AM.BaseType == X86ISelAddressMode::RegBase &&
AM.Base_Reg.getNode() == nullptr) {
AM.Base_Reg = AM.IndexReg;
AM.Scale = 1;
}
// Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
// because it has a smaller encoding.
// TODO: Which other code models can use this?
switch (TM.getCodeModel()) {
default: break;
case CodeModel::Small:
case CodeModel::Kernel:
if (Subtarget->is64Bit() &&
AM.Scale == 1 &&
AM.BaseType == X86ISelAddressMode::RegBase &&
AM.Base_Reg.getNode() == nullptr &&
AM.IndexReg.getNode() == nullptr &&
AM.SymbolFlags == X86II::MO_NO_FLAG &&
AM.hasSymbolicDisplacement())
AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
break;
}
return false;
}
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
unsigned Depth) {
// Add an artificial use to this node so that we can keep track of
// it if it gets CSE'd with a different node.
HandleSDNode Handle(N);
X86ISelAddressMode Backup = AM;
if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
return false;
AM = Backup;
// Try again after commuting the operands.
if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
!matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
return false;
AM = Backup;
// If we couldn't fold both operands into the address at the same time,
// see if we can just put each operand into a register and fold at least
// the add.
if (AM.BaseType == X86ISelAddressMode::RegBase &&
!AM.Base_Reg.getNode() &&
!AM.IndexReg.getNode()) {
N = Handle.getValue();
AM.Base_Reg = N.getOperand(0);
AM.IndexReg = N.getOperand(1);
AM.Scale = 1;
return false;
}
N = Handle.getValue();
return true;
}
// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
if (N->getNodeId() == -1 ||
(SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
DAG.RepositionNode(Pos->getIterator(), N.getNode());
// Mark Node as invalid for pruning, as after this it may be a successor to a
// selected node but otherwise be in the same position as Pos.
// Conservatively mark it with the same -abs(Id) to ensure the node id
// invariant is preserved.
N->setNodeId(Pos->getNodeId());
SelectionDAGISel::InvalidateNodeId(N.getNode());
}
}
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
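// For example, with C1 == 2: (X >> 6) & 0x3fc becomes ((X >> 8) & 0xff) << 2,
// i.e. an h-register extract used with a scale-4 index.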
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
uint64_t Mask,
SDValue Shift, SDValue X,
X86ISelAddressMode &AM) {
if (Shift.getOpcode() != ISD::SRL ||
!isa<ConstantSDNode>(Shift.getOperand(1)) ||
!Shift.hasOneUse())
return true;
int ScaleLog = 8 - Shift.getConstantOperandVal(1);
if (ScaleLog <= 0 || ScaleLog >= 4 ||
Mask != (0xffu << ScaleLog))
return true;
MVT VT = N.getSimpleValueType();
SDLoc DL(N);
SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
SDValue NewMask = DAG.getConstant(0xff, DL, VT);
SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
// Insert the new nodes into the topological ordering. We must do this in
// a valid topological ordering as nothing is going to go back and re-sort
// these nodes. We continually insert before 'N' in sequence as this is
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
// hierarchy left to express.
insertDAGNode(DAG, N, Eight);
insertDAGNode(DAG, N, Srl);
insertDAGNode(DAG, N, NewMask);
insertDAGNode(DAG, N, And);
insertDAGNode(DAG, N, ShlCount);
insertDAGNode(DAG, N, Shl);
DAG.ReplaceAllUsesWith(N, Shl);
DAG.RemoveDeadNode(N.getNode());
AM.IndexReg = And;
AM.Scale = (1 << ScaleLog);
return false;
}
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
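// For example, with C1 == 1: (X << 1) & 0xff00 becomes (X & 0x7f80) << 1,
// letting the shl be matched as a scale-2 index.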
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
X86ISelAddressMode &AM) {
SDValue Shift = N.getOperand(0);
// Use a signed mask so that shifting right will insert sign bits. These
// bits will be removed when we shift the result left so it doesn't matter
// what we use. This might allow a smaller immediate encoding.
int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
// If we have an any_extend feeding the AND, look through it to see if there
// is a shift behind it. But only if the AND doesn't use the extended bits.
// FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
bool FoundAnyExtend = false;
if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
isUInt<32>(Mask)) {
FoundAnyExtend = true;
Shift = Shift.getOperand(0);
}
if (Shift.getOpcode() != ISD::SHL ||
!isa<ConstantSDNode>(Shift.getOperand(1)))
return true;
SDValue X = Shift.getOperand(0);
// Not likely to be profitable if either the AND or SHIFT node has more
// than one use (unless all uses are for address computation). Besides, the
// isel mechanism requires their node ids to be reused.
if (!N.hasOneUse() || !Shift.hasOneUse())
return true;
// Verify that the shift amount is something we can fold.
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
return true;
MVT VT = N.getSimpleValueType();
SDLoc DL(N);
if (FoundAnyExtend) {
SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
insertDAGNode(DAG, N, NewX);
X = NewX;
}
SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
// Insert the new nodes into the topological ordering. We must do this in
// a valid topological ordering as nothing is going to go back and re-sort
// these nodes. We continually insert before 'N' in sequence as this is
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
// hierarchy left to express.
insertDAGNode(DAG, N, NewMask);
insertDAGNode(DAG, N, NewAnd);
insertDAGNode(DAG, N, NewShift);
DAG.ReplaceAllUsesWith(N, NewShift);
DAG.RemoveDeadNode(N.getNode());
AM.Scale = 1 << ShiftAmt;
AM.IndexReg = NewAnd;
return false;
}
// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. This results in code such as:
//
// int f(short *y, int *lookup_table) {
// ...
// return *y + lookup_table[*y >> 11];
// }
//
// Turning into:
// movzwl (%rdi), %eax
// movl %eax, %ecx
// shrl $11, %ecx
// addl (%rsi,%rcx,4), %eax
//
// Instead of:
// movzwl (%rdi), %eax
// movl %eax, %ecx
// shrl $9, %ecx
// andl $124, %ecx
// addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial.
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
uint64_t Mask,
SDValue Shift, SDValue X,
X86ISelAddressMode &AM) {
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
!isa<ConstantSDNode>(Shift.getOperand(1)))
return true;
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
unsigned MaskLZ = countLeadingZeros(Mask);
unsigned MaskTZ = countTrailingZeros(Mask);
// The amount of shift we're trying to fit into the addressing mode is taken
// from the trailing zeros of the mask.
unsigned AMShiftAmt = MaskTZ;
// There is nothing we can do here unless the mask is removing some bits.
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
// We also need to ensure that the mask is a contiguous run of bits.
if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
// Scale the leading zero count down based on the actual size of the value.
// Also scale it down based on the size of the shift.
unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
if (MaskLZ < ScaleDown)
return true;
MaskLZ -= ScaleDown;
// The final check is to ensure that any masked out high bits of X are
// already known to be zero. Otherwise, the mask has a semantic impact
// other than masking out a couple of low bits. Unfortunately, because of
// the mask, zero extensions will be removed from operands in some cases.
// This code works extra hard to look through extensions because we can
// replace them with zero extensions cheaply if necessary.
bool ReplacingAnyExtend = false;
if (X.getOpcode() == ISD::ANY_EXTEND) {
unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
X.getOperand(0).getSimpleValueType().getSizeInBits();
// Assume that we'll replace the any-extend with a zero-extend, and
// narrow the search to the extended value.
X = X.getOperand(0);
MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
ReplacingAnyExtend = true;
}
APInt MaskedHighBits =
APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
KnownBits Known = DAG.computeKnownBits(X);
if (MaskedHighBits != Known.Zero) return true;
// We've identified a pattern that can be transformed into a single shift
// and an addressing mode. Make it so.
MVT VT = N.getSimpleValueType();
if (ReplacingAnyExtend) {
assert(X.getValueType() != VT);
// We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
insertDAGNode(DAG, N, NewX);
X = NewX;
}
SDLoc DL(N);
SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
// Insert the new nodes into the topological ordering. We must do this in
// a valid topological ordering as nothing is going to go back and re-sort
// these nodes. We continually insert before 'N' in sequence as this is
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
// hierarchy left to express.
insertDAGNode(DAG, N, NewSRLAmt);
insertDAGNode(DAG, N, NewSRL);
insertDAGNode(DAG, N, NewSHLAmt);
insertDAGNode(DAG, N, NewSHL);
DAG.ReplaceAllUsesWith(N, NewSHL);
DAG.RemoveDeadNode(N.getNode());
AM.Scale = 1 << AMShiftAmt;
AM.IndexReg = NewSRL;
return false;
}
// Transform "(X >> SHIFT) & (MASK << C1)" to
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
// matched to a BEXTR later. Returns false if the simplification is performed.
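// For example: (X >> 4) & (0xff << 2) becomes ((X >> 6) & 0xff) << 2, where
// the srl+and is a BEXTR candidate and the final shl becomes a scale-4 index.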
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
uint64_t Mask,
SDValue Shift, SDValue X,
X86ISelAddressMode &AM,
const X86Subtarget &Subtarget) {
if (Shift.getOpcode() != ISD::SRL ||
!isa<ConstantSDNode>(Shift.getOperand(1)) ||
!Shift.hasOneUse() || !N.hasOneUse())
return true;
// Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
if (!Subtarget.hasTBM() &&
!(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
return true;
// We need to ensure that the mask is a contiguous run of bits.
if (!isShiftedMask_64(Mask)) return true;
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
// The amount of shift we're trying to fit into the addressing mode is taken
// from the trailing zeros of the mask.
unsigned AMShiftAmt = countTrailingZeros(Mask);
// There is nothing we can do here unless the mask is removing some bits.
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
MVT VT = N.getSimpleValueType();
SDLoc DL(N);
SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
// Insert the new nodes into the topological ordering. We must do this in
// a valid topological ordering as nothing is going to go back and re-sort
// these nodes. We continually insert before 'N' in sequence as this is
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
// hierarchy left to express.
insertDAGNode(DAG, N, NewSRLAmt);
insertDAGNode(DAG, N, NewSRL);
insertDAGNode(DAG, N, NewMask);
insertDAGNode(DAG, N, NewAnd);
insertDAGNode(DAG, N, NewSHLAmt);
insertDAGNode(DAG, N, NewSHL);
DAG.ReplaceAllUsesWith(N, NewSHL);
DAG.RemoveDeadNode(N.getNode());
AM.Scale = 1 << AMShiftAmt;
AM.IndexReg = NewAnd;
return false;
}
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth) {
SDLoc dl(N);
LLVM_DEBUG({
dbgs() << "MatchAddress: ";
AM.dump(CurDAG);
});
// Limit recursion.
if (Depth > 5)
return matchAddressBase(N, AM);
// If this is already a %rip relative address, we can only merge immediates
// into it. Instead of handling this in every case, we handle it here.
// RIP relative addressing: %rip + 32-bit displacement!
if (AM.isRIPRelative()) {
// FIXME: JumpTable and ExternalSymbol addresses currently don't like
// displacements. It isn't very important, but this should be fixed for
// consistency.
if (!(AM.ES || AM.MCSym) && AM.JT != -1)
return true;
if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
return false;
return true;
}
switch (N.getOpcode()) {
default: break;
case ISD::LOCAL_RECOVER: {
if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
// Use the symbol and don't prefix it.
AM.MCSym = ESNode->getMCSymbol();
return false;
}
break;
}
case ISD::Constant: {
uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
if (!foldOffsetIntoAddress(Val, AM))
return false;
break;
}
case X86ISD::Wrapper:
case X86ISD::WrapperRIP:
if (!matchWrapper(N, AM))
return false;
break;
case ISD::LOAD:
if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
return false;
break;
case ISD::FrameIndex:
if (AM.BaseType == X86ISelAddressMode::RegBase &&
AM.Base_Reg.getNode() == nullptr &&
(!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
AM.BaseType = X86ISelAddressMode::FrameIndexBase;
AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
return false;
}
break;
case ISD::SHL:
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
break;
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
unsigned Val = CN->getZExtValue();
// Note that we handle x<<1 as (,x,2) rather than (x,x) here so
// that the base operand remains free for further matching. If
// the base doesn't end up getting used, a post-processing step
// in MatchAddress turns (,x,2) into (x,x), which is cheaper.
if (Val == 1 || Val == 2 || Val == 3) {
AM.Scale = 1 << Val;
SDValue ShVal = N.getOperand(0);
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (CurDAG->isBaseWithConstantOffset(ShVal)) {
AM.IndexReg = ShVal.getOperand(0);
ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
if (!foldOffsetIntoAddress(Disp, AM))
return false;
}
AM.IndexReg = ShVal;
return false;
}
}
break;
case ISD::SRL: {
// Scale must not be used already.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
// We only handle up to 64-bit values here as those are what matter for
// addressing mode optimizations.
assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
"Unexpected value size!");
SDValue And = N.getOperand(0);
if (And.getOpcode() != ISD::AND) break;
SDValue X = And.getOperand(0);
// The mask used for the transform is expected to be post-shift, but we
// found the shift first so just apply the shift to the mask before passing
// it down.
if (!isa<ConstantSDNode>(N.getOperand(1)) ||
!isa<ConstantSDNode>(And.getOperand(1)))
break;
uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
// Try to fold the mask and shift into the scale, and return false if we
// succeed.
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
return false;
break;
}
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI:
// A mul_lohi where we need the low part can be folded as a plain multiply.
if (N.getResNo() != 0) break;
LLVM_FALLTHROUGH;
case ISD::MUL:
case X86ISD::MUL_IMM:
// X*[3,5,9] -> X+X*[2,4,8]
if (AM.BaseType == X86ISelAddressMode::RegBase &&
AM.Base_Reg.getNode() == nullptr &&
AM.IndexReg.getNode() == nullptr) {
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
CN->getZExtValue() == 9) {
AM.Scale = unsigned(CN->getZExtValue())-1;
SDValue MulVal = N.getOperand(0);
SDValue Reg;
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
isa<ConstantSDNode>(MulVal.getOperand(1))) {
Reg = MulVal.getOperand(0);
ConstantSDNode *AddVal =
cast<ConstantSDNode>(MulVal.getOperand(1));
uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
if (foldOffsetIntoAddress(Disp, AM))
Reg = N.getOperand(0);
} else {
Reg = N.getOperand(0);
}
AM.IndexReg = AM.Base_Reg = Reg;
return false;
}
}
break;
case ISD::SUB: {
// Given A-B, if A can be completely folded into the address with the
// index field unused, use -B as the index. This is a win if A has
// multiple parts that can be folded into the address. Also, this saves
// a mov if the base register has other uses, since it avoids a
// two-address sub instruction; however, it costs an additional mov if
// the index register has other uses.
// Add an artificial use to this node so that we can keep track of
// it if it gets CSE'd with a different node.
HandleSDNode Handle(N);
// Test if the LHS of the sub can be folded.
X86ISelAddressMode Backup = AM;
if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
N = Handle.getValue();
AM = Backup;
break;
}
N = Handle.getValue();
// Test if the index field is free for use.
if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
AM = Backup;
break;
}
int Cost = 0;
SDValue RHS = N.getOperand(1);
// If the RHS involves a register with multiple uses, this
// transformation incurs an extra mov, due to the neg instruction
// clobbering its operand.
if (!RHS.getNode()->hasOneUse() ||
RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
(RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
RHS.getOperand(0).getValueType() == MVT::i32))
++Cost;
// If the base is a register with multiple uses, this
// transformation may save a mov.
if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
!AM.Base_Reg.getNode()->hasOneUse()) ||
AM.BaseType == X86ISelAddressMode::FrameIndexBase)
--Cost;
// If the folded LHS was interesting, this transformation saves
// address arithmetic.
if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
((AM.Disp != 0) && (Backup.Disp == 0)) +
(AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
--Cost;
// If it doesn't look like it may be an overall win, don't do it.
if (Cost >= 0) {
AM = Backup;
break;
}
// Ok, the transformation is legal and appears profitable. Go for it.
// Negation will be emitted later to avoid creating dangling nodes if this
// was an unprofitable LEA.
AM.IndexReg = RHS;
AM.NegateIndex = true;
AM.Scale = 1;
return false;
}
case ISD::ADD:
if (!matchAdd(N, AM, Depth))
return false;
break;
case ISD::OR:
// We want to look through a transform in InstCombine and DAGCombiner that
// turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
// Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
// An 'lea' can then be used to match the shift (multiply) and add:
// and $1, %esi
// lea (%rsi, %rdi, 8), %rax
if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
!matchAdd(N, AM, Depth))
return false;
break;
case ISD::AND: {
// Perform some heroic transforms on an and of a constant-count shift
// with a constant to enable use of the scaled offset field.
// Scale must not be used already.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
// We only handle up to 64-bit values here as those are what matter for
// addressing mode optimizations.
assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
"Unexpected value size!");
if (!isa<ConstantSDNode>(N.getOperand(1)))
break;
if (N.getOperand(0).getOpcode() == ISD::SRL) {
SDValue Shift = N.getOperand(0);
SDValue X = Shift.getOperand(0);
uint64_t Mask = N.getConstantOperandVal(1);
// Try to fold the mask and shift into an extract and scale.
if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
return false;
// Try to fold the mask and shift directly into the scale.
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
return false;
// Try to fold the mask and shift into BEXTR and scale.
if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
return false;
}
// Try to swap the mask and shift to place shifts which can be done as
// a scale on the outside of the mask.
if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
return false;
break;
}
case ISD::ZERO_EXTEND: {
// Try to widen a zexted shift left to the same size as its use, so we can
// match the shift as a scale factor.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
break;
if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
break;
// Give up if the shift amount is not in [1,3]; only those give a valid
// LEA scale (2, 4, or 8).
SDValue Shl = N.getOperand(0);
auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
if (!ShAmtC || ShAmtC->getZExtValue() > 3)
break;
// The narrow shift must only shift out zero bits (it must be 'nuw').
// That makes it safe to widen to the destination type.
APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
ShAmtC->getZExtValue());
if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
break;
// zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
MVT VT = N.getSimpleValueType();
SDLoc DL(N);
SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
// Convert the shift to scale factor.
AM.Scale = 1 << ShAmtC->getZExtValue();
AM.IndexReg = Zext;
insertDAGNode(*CurDAG, N, Zext);
insertDAGNode(*CurDAG, N, NewShl);
CurDAG->ReplaceAllUsesWith(N, NewShl);
CurDAG->RemoveDeadNode(N.getNode());
return false;
}
}
return matchAddressBase(N, AM);
}
/// Helper for MatchAddress. Add the specified node to the
/// specified addressing mode without any further recursion.
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
// Is the base register already occupied?
if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
// If so, check to see if the scale index register is set.
if (!AM.IndexReg.getNode()) {
AM.IndexReg = N;
AM.Scale = 1;
return false;
}
// Otherwise, we cannot select it.
return true;
}
// Otherwise, default to generating it as a register.
AM.BaseType = X86ISelAddressMode::RegBase;
AM.Base_Reg = N;
return false;
}
/// Helper for selectVectorAddr. Handles things that can be folded into a
/// gather/scatter address. The index register and scale should have already
/// been handled.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
// TODO: Support other operations.
switch (N.getOpcode()) {
case ISD::Constant: {
uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
if (!foldOffsetIntoAddress(Val, AM))
return false;
break;
}
case X86ISD::Wrapper:
if (!matchWrapper(N, AM))
return false;
break;
}
return matchAddressBase(N, AM);
}
bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
X86ISelAddressMode AM;
auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
AM.IndexReg = Mgs->getIndex();
AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
if (AddrSpace == X86AS::GS)
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
if (AddrSpace == X86AS::FS)
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
if (AddrSpace == X86AS::SS)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
// Try to match into the base and displacement fields.
if (matchVectorAddress(N, AM))
return false;
getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
///
/// Parent is the parent node of the addr operand that is being matched. It
/// is always a load, store, atomic node, or null. It is only null when
/// checking memory operands for inline asm nodes.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
X86ISelAddressMode AM;
if (Parent &&
// This list of opcodes covers all the nodes that have an "addr:$ptr" operand
// but are not a MemSDNode, and thus don't have proper addrspace info.
Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
unsigned AddrSpace =
cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
if (AddrSpace == 256)
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
if (AddrSpace == 257)
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
if (AddrSpace == 258)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
}
// Save the DL and VT before calling matchAddress, it can invalidate N.
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
if (matchAddress(N, AM))
return false;
getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
// We can only fold a load if all nodes between it and the root node have a
// single use. If there are additional uses, we could end up duplicating the
// load.
static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
while (User != Root) {
if (!User->hasOneUse())
return false;
User = *User->use_begin();
}
return true;
}
/// Match a scalar SSE load. In particular, we want to match a load whose top
/// elements are either undef or zeros. The load flavor is derived from the
/// type of N, which is either v4f32 or v2f64.
///
/// We also return:
/// PatternChainNode: this is the matched node that has a chain input and
/// output.
bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment,
SDValue &PatternNodeWithChain) {
if (!hasSingleUsesFromRoot(Root, Parent))
return false;
// We can allow a full vector load here since narrowing a load is ok unless
// it's volatile or atomic.
if (ISD::isNON_EXTLoad(N.getNode())) {
LoadSDNode *LD = cast<LoadSDNode>(N);
if (LD->isSimple() &&
IsProfitableToFold(N, LD, Root) &&
IsLegalToFold(N, Parent, Root, OptLevel)) {
PatternNodeWithChain = N;
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
}
}
// We can also match the special zero extended load opcode.
if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
PatternNodeWithChain = N;
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
Segment);
}
}
// Need to make sure that the SCALAR_TO_VECTOR and load are both only used
// once. Otherwise the load might get duplicated and the chain output of the
// duplicate load will not be observed by all dependencies.
if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
PatternNodeWithChain = N.getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
}
}
return false;
}
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CN->getZExtValue();
if (!isUInt<32>(ImmVal))
return false;
Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
return true;
}
// In static codegen with small code model, we can get the address of a label
// into a register with 'movl'.
if (N->getOpcode() != X86ISD::Wrapper)
return false;
N = N.getOperand(0);
// At least GNU as does not accept 'movl' for TPOFF relocations.
// FIXME: We could use 'movl' when we know we are targeting MC.
if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
return false;
Imm = N;
if (N->getOpcode() != ISD::TargetGlobalAddress)
return TM.getCodeModel() == CodeModel::Small;
Optional<ConstantRange> CR =
cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
if (!CR)
return TM.getCodeModel() == CodeModel::Small;
return CR->getUnsignedMax().ult(1ull << 32);
}
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
// Save the debug loc before calling selectLEAAddr, in case it invalidates N.
SDLoc DL(N);
if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
return false;
RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
if (RN && RN->getReg() == 0)
Base = CurDAG->getRegister(0, MVT::i64);
else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
// Base could already be %rip, particularly in the x32 ABI.
SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
MVT::i64), 0);
Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
Base);
}
RN = dyn_cast<RegisterSDNode>(Index);
if (RN && RN->getReg() == 0)
Index = CurDAG->getRegister(0, MVT::i64);
else {
assert(Index.getValueType() == MVT::i32 &&
"Expect to be extending 32-bit registers for use in LEA");
SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
MVT::i64), 0);
Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
Index);
}
return true;
}
/// Calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
X86ISelAddressMode AM;
// Save the DL and VT before calling matchAddress, it can invalidate N.
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
// Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
// segments.
SDValue Copy = AM.Segment;
SDValue T = CurDAG->getRegister(0, MVT::i32);
AM.Segment = T;
if (matchAddress(N, AM))
return false;
assert(T == AM.Segment);
AM.Segment = Copy;
unsigned Complexity = 0;
if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
Complexity = 1;
else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
Complexity = 4;
if (AM.IndexReg.getNode())
Complexity++;
// Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or to
// use a simple shift.
if (AM.Scale > 1)
Complexity++;
// FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
// to a LEA. This is determined with some experimentation but is by no means
// optimal (especially for code size consideration). LEA is nice because of
// its three-address nature. Tweak the cost function again when we can run
// convertToThreeAddress() at register allocation time.
if (AM.hasSymbolicDisplacement()) {
// For X86-64, always use LEA to materialize RIP-relative addresses.
if (Subtarget->is64Bit())
Complexity = 4;
else
Complexity += 2;
}
// Heuristic: try harder to form an LEA from ADD if the operands set flags.
// Unlike ADD, LEA does not affect flags, so we will be less likely to require
// duplicating flag-producing instructions later in the pipeline.
if (N.getOpcode() == ISD::ADD) {
auto isMathWithFlags = [](SDValue V) {
switch (V.getOpcode()) {
case X86ISD::ADD:
case X86ISD::SUB:
case X86ISD::ADC:
case X86ISD::SBB:
/* TODO: These opcodes can be added safely, but we may want to justify
their inclusion for different reasons (better for reg-alloc).
case X86ISD::SMUL:
case X86ISD::UMUL:
case X86ISD::OR:
case X86ISD::XOR:
case X86ISD::AND:
*/
// Value 1 is the flag output of the node - verify it's not dead.
return !SDValue(V.getNode(), 1).use_empty();
default:
return false;
}
};
// TODO: This could be an 'or' rather than 'and' to make the transform more
// likely to happen. We might want to factor in whether there's a
// load folding opportunity for the math op that disappears with LEA.
if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
Complexity++;
}
if (AM.Disp)
Complexity++;
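// For example (illustrative), an address like 16(%rdi,%rsi,4) scores
// base(1) + index(1) + scale(1) + disp(1) = 4 and is worth keeping as an LEA.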
// If it isn't worth using an LEA, reject it.
if (Complexity <= 2)
return false;
getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
/// This is only run on TargetGlobalTLSAddress nodes.
bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
X86ISelAddressMode AM;
AM.GV = GA->getGlobal();
AM.Disp += GA->getOffset();
AM.SymbolFlags = GA->getTargetFlags();
MVT VT = N.getSimpleValueType();
if (VT == MVT::i32) {
AM.Scale = 1;
AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
}
getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
return true;
}
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
N.getValueType());
return true;
}
// Keep track of the original value type and whether this value was
// truncated. If we see a truncation from pointer type to VT that truncates
// bits that are known to be zero, we can use a narrow reference.
EVT VT = N.getValueType();
bool WasTruncated = false;
if (N.getOpcode() == ISD::TRUNCATE) {
WasTruncated = true;
N = N.getOperand(0);
}
if (N.getOpcode() != X86ISD::Wrapper)
return false;
// We can only use non-GlobalValues as immediates if they were not truncated,
// as we do not have any range information. If we have a GlobalValue and the
// address was not truncated, we can select it as an operand directly.
unsigned Opc = N.getOperand(0)->getOpcode();
if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
Op = N.getOperand(0);
// We can only select the operand directly if we didn't have to look past a
// truncate.
return !WasTruncated;
}
// Check that the global's range fits into VT.
auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
return false;
// Okay, we can use a narrow reference.
Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
GA->getOffset(), GA->getTargetFlags());
return true;
}
bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
assert(Root && P && "Unknown root/parent nodes");
if (!ISD::isNON_EXTLoad(N.getNode()) ||
!IsProfitableToFold(N, P, Root) ||
!IsLegalToFold(N, P, Root, OptLevel))
return false;
return selectAddr(N.getNode(),
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
assert(Root && P && "Unknown root/parent nodes");
if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
!IsProfitableToFold(N, P, Root) ||
!IsLegalToFold(N, P, Root, OptLevel))
return false;
return selectAddr(N.getNode(),
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
auto &DL = MF->getDataLayout();
return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
}
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
if (N->getOpcode() == ISD::TRUNCATE)
N = N->getOperand(0).getNode();
if (N->getOpcode() != X86ISD::Wrapper)
return false;
auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
if (!GA)
return false;
Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
return CR && CR->getSignedMin().sge(-1ull << Width) &&
CR->getSignedMax().slt(1ull << Width);
}
static X86::CondCode getCondFromNode(SDNode *N) {
assert(N->isMachineOpcode() && "Unexpected node");
X86::CondCode CC = X86::COND_INVALID;
unsigned Opc = N->getMachineOpcode();
if (Opc == X86::JCC_1)
CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
else if (Opc == X86::SETCCr)
CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
else if (Opc == X86::SETCCm)
CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
Opc == X86::CMOV64rr)
CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
Opc == X86::CMOV64rm)
CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
return CC;
}
/// Test whether the given X86ISD::CMP node has any users that use a flag
/// other than ZF.
bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
// Examine each user of the node.
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
// Only check things that use the flags.
if (UI.getUse().getResNo() != Flags.getResNo())
continue;
// Only examine CopyToReg uses that copy to EFLAGS.
if (UI->getOpcode() != ISD::CopyToReg ||
cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
return false;
// Examine each user of the CopyToReg use.
for (SDNode::use_iterator FlagUI = UI->use_begin(),
FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
// Only examine the Flag result.
if (FlagUI.getUse().getResNo() != 1) continue;
// Anything unusual: assume conservatively.
if (!FlagUI->isMachineOpcode()) return false;
// Examine the condition code of the user.
X86::CondCode CC = getCondFromNode(*FlagUI);
switch (CC) {
// Comparisons which only use the zero flag.
case X86::COND_E: case X86::COND_NE:
continue;
// Anything else: assume conservatively.
default:
return false;
}
}
}
return true;
}
/// Test whether the given X86ISD::CMP node has any uses which require the SF
/// flag to be accurate.
bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
// Examine each user of the node.
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
// Only check things that use the flags.
if (UI.getUse().getResNo() != Flags.getResNo())
continue;
// Only examine CopyToReg uses that copy to EFLAGS.
if (UI->getOpcode() != ISD::CopyToReg ||
cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
return false;
// Examine each user of the CopyToReg use.
for (SDNode::use_iterator FlagUI = UI->use_begin(),
FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
// Only examine the Flag result.
if (FlagUI.getUse().getResNo() != 1) continue;
// Anything unusual: assume conservatively.
if (!FlagUI->isMachineOpcode()) return false;
// Examine the condition code of the user.
X86::CondCode CC = getCondFromNode(*FlagUI);
switch (CC) {
// Comparisons which don't examine the SF flag.
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
case X86::COND_E: case X86::COND_NE:
case X86::COND_O: case X86::COND_NO:
case X86::COND_P: case X86::COND_NP:
continue;
// Anything else: assume conservatively.
default:
return false;
}
}
}
return true;
}
static bool mayUseCarryFlag(X86::CondCode CC) {
switch (CC) {
// Comparisons which don't examine the CF flag.
case X86::COND_O: case X86::COND_NO:
case X86::COND_E: case X86::COND_NE:
case X86::COND_S: case X86::COND_NS:
case X86::COND_P: case X86::COND_NP:
case X86::COND_L: case X86::COND_GE:
case X86::COND_G: case X86::COND_LE:
return false;
// Anything else: assume conservatively.
default:
return true;
}
}
/// Test whether the given node which sets flags has any uses which require the
/// CF flag to be accurate.
bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
// Examine each user of the node.
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
// Only check things that use the flags.
if (UI.getUse().getResNo() != Flags.getResNo())
continue;
unsigned UIOpc = UI->getOpcode();
if (UIOpc == ISD::CopyToReg) {
// Only examine CopyToReg uses that copy to EFLAGS.
if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
return false;
// Examine each user of the CopyToReg use.
for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
FlagUI != FlagUE; ++FlagUI) {
// Only examine the Flag result.
if (FlagUI.getUse().getResNo() != 1)
continue;
// Anything unusual: assume conservatively.
if (!FlagUI->isMachineOpcode())
return false;
// Examine the condition code of the user.
X86::CondCode CC = getCondFromNode(*FlagUI);
if (mayUseCarryFlag(CC))
return false;
}
// This CopyToReg is ok. Move on to the next user.
continue;
}
// This might be an unselected node. So look for the pre-isel opcodes that
// use flags.
unsigned CCOpNo;
switch (UIOpc) {
default:
// Something unusual. Be conservative.
return false;
case X86ISD::SETCC: CCOpNo = 0; break;
case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
case X86ISD::CMOV: CCOpNo = 2; break;
case X86ISD::BRCOND: CCOpNo = 2; break;
}
X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
if (mayUseCarryFlag(CC))
return false;
}
return true;
}
/// Check whether or not the chain ending in StoreNode is suitable for doing
/// the {load; op; store} to modify transformation.
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
SDValue StoredVal, SelectionDAG *CurDAG,
unsigned LoadOpNo,
LoadSDNode *&LoadNode,
SDValue &InputChain) {
// Is the stored value result 0 of the operation?
if (StoredVal.getResNo() != 0) return false;
// Are there other uses of the operation other than the store?
if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
// Is the store non-extending and non-indexed?
if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
return false;
SDValue Load = StoredVal->getOperand(LoadOpNo);
// Is the stored value a non-extending and non-indexed load?
if (!ISD::isNormalLoad(Load.getNode())) return false;
// Return LoadNode by reference.
LoadNode = cast<LoadSDNode>(Load);
// Is store the only read of the loaded value?
if (!Load.hasOneUse())
return false;
// Is the address of the store the same as the load?
if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
LoadNode->getOffset() != StoreNode->getOffset())
return false;
bool FoundLoad = false;
SmallVector<SDValue, 4> ChainOps;
SmallVector<const SDNode *, 4> LoopWorklist;
SmallPtrSet<const SDNode *, 16> Visited;
const unsigned int Max = 1024;
// Visualization of Load-Op-Store fusion:
// -------------------------
// Legend:
// *-lines = Chain operand dependencies.
// |-lines = Normal operand dependencies.
// Dependencies flow down and right. n-suffix references multiple nodes.
//
// C Xn C
// * * *
// * * *
// Xn A-LD Yn TF Yn
// * * \ | * |
// * * \ | * |
// * * \ | => A--LD_OP_ST
// * * \| \
// TF OP \
// * | \ Zn
// * | \
// A-ST Zn
//
// This merge induced dependences from: #1: Xn -> LD, OP, Zn
// #2: Yn -> LD
// #3: ST -> Zn
// Ensure the transform is safe by checking for the dual
// dependencies to make sure we do not induce a loop.
// As LD is a predecessor to both OP and ST we can do this by checking:
// a). if LD is a predecessor to a member of Xn or Yn.
// b). if a Zn is a predecessor to ST.
// However, (b) can only occur through being a chain predecessor to
// ST, which is the same as Zn being a member or predecessor of Xn,
// which is a subset of LD being a predecessor of Xn. So it's
// subsumed by check (a).
SDValue Chain = StoreNode->getChain();
// Gather X elements in ChainOps.
if (Chain == Load.getValue(1)) {
FoundLoad = true;
ChainOps.push_back(Load.getOperand(0));
} else if (Chain.getOpcode() == ISD::TokenFactor) {
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
FoundLoad = true;
// Drop Load, but keep its chain. No cycle check necessary.
ChainOps.push_back(Load.getOperand(0));
continue;
}
LoopWorklist.push_back(Op.getNode());
ChainOps.push_back(Op);
}
}
if (!FoundLoad)
return false;
// Worklist is currently Xn. Add Yn to worklist.
for (SDValue Op : StoredVal->ops())
if (Op.getNode() != LoadNode)
LoopWorklist.push_back(Op.getNode());
// Check (a) if Load is a predecessor to Xn + Yn
if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
true))
return false;
InputChain =
CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
return true;
}
// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its
// address are suitable.
//
// The tablegen memory operand pattern is currently not able to match
// the case where the EFLAGS on the original operation are used.
//
// To move this to tablegen, we'll need to improve tablegen to allow flags to
// be transferred from a node in the pattern to the result node, probably with
// a new keyword. For example, we have this
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
// (implicit EFLAGS)]>;
// but maybe need something like this
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
// (transferrable EFLAGS)]>;
//
// Until then, we manually fold these and instruction select the operation
// here.
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
SDValue StoredVal = StoreNode->getOperand(1);
unsigned Opc = StoredVal->getOpcode();
// Before we try to select anything, make sure this is a memory operand size
// and an opcode we can handle. Note that this must match the code below that
// actually lowers the opcodes.
EVT MemVT = StoreNode->getMemoryVT();
if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
MemVT != MVT::i8)
return false;
bool IsCommutable = false;
bool IsNegate = false;
switch (Opc) {
default:
return false;
case X86ISD::SUB:
IsNegate = isNullConstant(StoredVal.getOperand(0));
break;
case X86ISD::SBB:
break;
case X86ISD::ADD:
case X86ISD::ADC:
case X86ISD::AND:
case X86ISD::OR:
case X86ISD::XOR:
IsCommutable = true;
break;
}
unsigned LoadOpNo = IsNegate ? 1 : 0;
LoadSDNode *LoadNode = nullptr;
SDValue InputChain;
if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
LoadNode, InputChain)) {
if (!IsCommutable)
return false;
// This operation is commutable, try the other operand.
LoadOpNo = 1;
if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
LoadNode, InputChain))
return false;
}
SDValue Base, Scale, Index, Disp, Segment;
if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
Segment))
return false;
auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
unsigned Opc8) {
switch (MemVT.getSimpleVT().SimpleTy) {
case MVT::i64:
return Opc64;
case MVT::i32:
return Opc32;
case MVT::i16:
return Opc16;
case MVT::i8:
return Opc8;
default:
llvm_unreachable("Invalid size!");
}
};
MachineSDNode *Result;
switch (Opc) {
case X86ISD::SUB:
// Handle negate.
if (IsNegate) {
unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
X86::NEG8m);
const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
MVT::Other, Ops);
break;
}
LLVM_FALLTHROUGH;
case X86ISD::ADD:
// Try to match inc/dec.
if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
bool IsOne = isOneConstant(StoredVal.getOperand(1));
bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
// An ADD/SUB with 1/-1 whose carry flag isn't used can become inc/dec.
if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
unsigned NewOpc =
((Opc == X86ISD::ADD) == IsOne)
? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
: SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
MVT::Other, Ops);
break;
}
}
LLVM_FALLTHROUGH;
case X86ISD::ADC:
case X86ISD::SBB:
case X86ISD::AND:
case X86ISD::OR:
case X86ISD::XOR: {
auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
switch (Opc) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
X86::ADD8mr);
case X86ISD::ADC:
return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
X86::ADC8mr);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
X86::SUB8mr);
case X86ISD::SBB:
return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
X86::SBB8mr);
case X86ISD::AND:
return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
X86::AND8mr);
case X86ISD::OR:
return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
case X86ISD::XOR:
return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
X86::XOR8mr);
default:
llvm_unreachable("Invalid opcode!");
}
};
auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
switch (Opc) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
case X86ISD::ADC:
return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
case X86ISD::SBB:
return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
case X86ISD::AND:
return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
case X86ISD::OR:
return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
case X86ISD::XOR:
return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
default:
llvm_unreachable("Invalid opcode!");
}
};
auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
switch (Opc) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
X86::ADD8mi);
case X86ISD::ADC:
return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
X86::ADC8mi);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
X86::SUB8mi);
case X86ISD::SBB:
return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
X86::SBB8mi);
case X86ISD::AND:
return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
X86::AND8mi);
case X86ISD::OR:
return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
X86::OR8mi);
case X86ISD::XOR:
return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
X86::XOR8mi);
default:
llvm_unreachable("Invalid opcode!");
}
};
unsigned NewOpc = SelectRegOpcode(Opc);
SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
// See if the operand is a constant that we can fold into an immediate
// operand.
if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
int64_t OperandV = OperandC->getSExtValue();
// Check if we can shrink the operand enough to fit in an immediate (or
// fit into a smaller immediate) by negating it and switching the
// operation.
if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
(MemVT == MVT::i64 && !isInt<32>(OperandV) &&
isInt<32>(-OperandV))) &&
hasNoCarryFlagUses(StoredVal.getValue(1))) {
OperandV = -OperandV;
Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
}
// First try to fit this into an Imm8 operand. If it doesn't fit, then try
// the larger immediate operand.
if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
NewOpc = SelectImm8Opcode(Opc);
} else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
NewOpc = SelectImmOpcode(Opc);
}
}
if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
SDValue CopyTo =
CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
StoredVal.getOperand(2), SDValue());
const SDValue Ops[] = {Base, Scale, Index, Disp,
Segment, Operand, CopyTo, CopyTo.getValue(1)};
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
Ops);
} else {
const SDValue Ops[] = {Base, Scale, Index, Disp,
Segment, Operand, InputChain};
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
Ops);
}
break;
}
default:
llvm_unreachable("Invalid opcode!");
}
MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
LoadNode->getMemOperand()};
CurDAG->setNodeMemRefs(Result, MemOps);
// Update Load Chain uses as well.
ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
CurDAG->RemoveDeadNode(Node);
return true;
}
// See if this is an X & Mask that we can match to BEXTR/BZHI.
// Where Mask is one of the following patterns:
// a) x & (1 << nbits) - 1
// b) x & ~(-1 << nbits)
// c) x & (-1 >> (32 - y))
// d) x << (32 - y) >> (32 - y)
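// For example (illustrative), pattern (a) maps directly onto BZHI: with x in
// %edi and nbits in %esi, x & ((1 << nbits) - 1) becomes
//   bzhil %esi, %edi, %eax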
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
assert(
(Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
"Should be either an and-mask, or right-shift after clearing high bits.");
// BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
return false;
MVT NVT = Node->getSimpleValueType(0);
// Only supported for 32 and 64 bits.
if (NVT != MVT::i32 && NVT != MVT::i64)
return false;
SDValue NBits;
// If we have BMI2's BZHI, we are ok with multi-use patterns.
// Else, if we only have BMI1's BEXTR, we require one-use.
const bool CanHaveExtraUses = Subtarget->hasBMI2();
auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
return CanHaveExtraUses ||
Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
};
auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
assert(V.getSimpleValueType() == MVT::i32 &&
V.getOperand(0).getSimpleValueType() == MVT::i64 &&
"Expected i64 -> i32 truncation");
V = V.getOperand(0);
}
return V;
};
// a) x & ((1 << nbits) + (-1))
auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
&NBits](SDValue Mask) -> bool {
// Match `add`. Must only have one use!
if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
return false;
// We should be adding an all-ones constant (i.e. subtracting one).
if (!isAllOnesConstant(Mask->getOperand(1)))
return false;
// Match `1 << nbits`. Might be truncated. Must only have one use!
SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
return false;
if (!isOneConstant(M0->getOperand(0)))
return false;
NBits = M0->getOperand(1);
return true;
};
auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
V = peekThroughOneUseTruncation(V);
return CurDAG->MaskedValueIsAllOnes(
V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
NVT.getSizeInBits()));
};
// b) x & ~(-1 << nbits)
auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
&NBits](SDValue Mask) -> bool {
// Match the `not` (an xor with all-ones). Must only have one use!
if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
return false;
// The -1 only has to be all-ones for the final Node's NVT.
if (!isAllOnes(Mask->getOperand(1)))
return false;
// Match `-1 << nbits`. Might be truncated. Must only have one use!
SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
return false;
// The -1 only has to be all-ones for the final Node's NVT.
if (!isAllOnes(M0->getOperand(0)))
return false;
NBits = M0->getOperand(1);
return true;
};
// Match potentially-truncated (bitwidth - y)
auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
unsigned Bitwidth) {
// Skip over a truncate of the shift amount.
if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
ShiftAmt = ShiftAmt.getOperand(0);
// The trunc should have been the only user of the real shift amount.
if (!checkOneUse(ShiftAmt))
return false;
}
// Match the shift amount as: (bitwidth - y). It should go away, too.
if (ShiftAmt.getOpcode() != ISD::SUB)
return false;
auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
if (!V0 || V0->getZExtValue() != Bitwidth)
return false;
NBits = ShiftAmt.getOperand(1);
return true;
};
// c) x & (-1 >> (32 - y))
auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
matchShiftAmt](SDValue Mask) -> bool {
// The mask itself may be truncated.
Mask = peekThroughOneUseTruncation(Mask);
unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
// Match the logical shift right. Must only have one use!
if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
return false;
// We should be shifting a truly all-ones constant.
if (!isAllOnesConstant(Mask.getOperand(0)))
return false;
SDValue M1 = Mask.getOperand(1);
// The shift amount should not be used externally.
if (!checkOneUse(M1))
return false;
return matchShiftAmt(M1, Bitwidth);
};
SDValue X;
// d) x << (32 - y) >> (32 - y)
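// (Illustrative: for i32 with y = 8, (x << 24) >> 24 keeps only the low 8
// bits of x.)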
auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
&X](SDNode *Node) -> bool {
if (Node->getOpcode() != ISD::SRL)
return false;
SDValue N0 = Node->getOperand(0);
if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
return false;
unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
SDValue N1 = Node->getOperand(1);
SDValue N01 = N0->getOperand(1);
// Both of the shifts must be by the exact same value.
// There should not be any uses of the shift amount outside of the pattern.
if (N1 != N01 || !checkTwoUse(N1))
return false;
if (!matchShiftAmt(N1, Bitwidth))
return false;
X = N0->getOperand(0);
return true;
};
auto matchLowBitMask = [matchPatternA, matchPatternB,
matchPatternC](SDValue Mask) -> bool {
return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
};
if (Node->getOpcode() == ISD::AND) {
X = Node->getOperand(0);
SDValue Mask = Node->getOperand(1);
if (matchLowBitMask(Mask)) {
// Great.
} else {
std::swap(X, Mask);
if (!matchLowBitMask(Mask))
return false;
}
} else if (!matchPatternD(Node))
return false;
SDLoc DL(Node);
// Truncate the shift amount.
NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
// Insert 8-bit NBits into lowest 8 bits of 32-bit register.
// All the other bits are undefined, we do not care about them.
SDValue ImplDef = SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
NBits = SDValue(
CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
NBits, SRIdxVal), 0);
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
if (Subtarget->hasBMI2()) {
// Great, just emit the BZHI.
if (NVT != MVT::i32) {
// But have to place the bit count into the wide-enough register first.
NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
}
SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
ReplaceNode(Node, Extract.getNode());
SelectCode(Extract.getNode());
return true;
}
// Else, if we do *NOT* have BMI2, let's find out if 'X' is *logically*
// shifted (potentially with a one-use trunc in between), and whether the
// truncation was the only use of the shift; if so, look past the one-use
// truncation.
{
SDValue RealX = peekThroughOneUseTruncation(X);
// FIXME: only if the shift is one-use?
if (RealX != X && RealX.getOpcode() == ISD::SRL)
X = RealX;
}
MVT XVT = X.getSimpleValueType();
// Else, emitting BEXTR requires one more step.
// The 'control' of BEXTR has the pattern of:
// [15...8 bit][ 7...0 bit] location
// [ bit count][ shift] name
// I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
// Shift NBits left by 8 bits, thus producing 'control'.
// This makes the low 8 bits to be zero.
SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
// If the 'X' is *logically* shifted, we can fold that shift into 'control'.
// FIXME: only if the shift is one-use?
if (X.getOpcode() == ISD::SRL) {
SDValue ShiftAmt = X.getOperand(1);
X = X.getOperand(0);
assert(ShiftAmt.getValueType() == MVT::i8 &&
"Expected shift amount to be i8");
// Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
// We could zext to i16 in some form, but we intentionally don't do that.
SDValue OrigShiftAmt = ShiftAmt;
ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
// And now 'or' these low 8 bits of shift amount into the 'control'.
Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
}
// But have to place the 'control' into the wide-enough register first.
if (XVT != MVT::i32) {
Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
}
// And finally, form the BEXTR itself.
SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
// The 'X' was originally truncated. Do that now.
if (XVT != NVT) {
insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
}
ReplaceNode(Node, Extract.getNode());
SelectCode(Extract.getNode());
return true;
}
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
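// For example (illustrative), (x >> 4) & 0xfff extracts 12 bits starting at
// bit 4, i.e. BEXTR with control 0x0c04 ((MaskSize << 8) | Shift).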
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
SDLoc dl(Node);
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
// If we have TBM we can use an immediate for the control. If we have BMI
// we should only do this if the BEXTR instruction is implemented well.
// Otherwise moving the control into a register makes this more costly.
// TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
// hoisting the move immediate would make it worthwhile with a less optimal
// BEXTR?
bool PreferBEXTR =
Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
if (!PreferBEXTR && !Subtarget->hasBMI2())
return nullptr;
// Must have a shift right.
if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
return nullptr;
// Shift can't have additional users.
if (!N0->hasOneUse())
return nullptr;
// Only supported for 32 and 64 bits.
if (NVT != MVT::i32 && NVT != MVT::i64)
return nullptr;
// Shift amount and RHS of and must be constant.
ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
if (!MaskCst || !ShiftCst)
return nullptr;
// And RHS must be a mask.
uint64_t Mask = MaskCst->getZExtValue();
if (!isMask_64(Mask))
return nullptr;
uint64_t Shift = ShiftCst->getZExtValue();
uint64_t MaskSize = countPopulation(Mask);
// Don't interfere with something that can be handled by extracting AH.
// TODO: If we are able to fold a load, BEXTR might still be better than AH.
if (Shift == 8 && MaskSize == 8)
return nullptr;
// Make sure we are only using bits that were in the original value, not
// shifted in.
if (Shift + MaskSize > NVT.getSizeInBits())
return nullptr;
// BZHI, if available, is always fast, unlike BEXTR. But even if we decide
// that we can't use BEXTR, it is only worthwhile using BZHI if the mask
// does not fit into 32 bits. Load folding is not a sufficient reason.
if (!PreferBEXTR && MaskSize <= 32)
return nullptr;
SDValue Control;
unsigned ROpc, MOpc;
if (!PreferBEXTR) {
assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
// If we can't make use of BEXTR then we can't fuse shift+mask stages.
// Let's perform the mask first and apply the shift later. Note that we need
// to widen the mask to account for the fact that we'll apply the shift
// afterwards!
Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
} else {
// The 'control' of BEXTR has the pattern of:
// [15...8 bit][ 7...0 bit] location
// [ bit count][ shift] name
// I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
if (Subtarget->hasTBM()) {
ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
} else {
assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
// BMI requires the immediate to be placed in a register.
ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
}
}
MachineSDNode *NewNode;
SDValue Input = N0->getOperand(0);
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = {
Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
// Record the mem-refs
CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
} else {
NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
}
if (!PreferBEXTR) {
// We still need to apply the shift.
SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
NewNode =
CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
}
return NewNode;
}
// Emit a PCMISTR(I/M) instruction.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
bool MayFoldLoad, const SDLoc &dl,
MVT VT, SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
SDValue Imm = Node->getOperand(2);
const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
// Try to fold a load. No need to check alignment.
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
N1.getOperand(0) };
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
return CNode;
}
SDValue Ops[] = { N0, N1, Imm };
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
return CNode;
}
// Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
// to emit a second instruction after this one. This is needed since we have two
// copyToReg nodes glued before this and we need to continue that glue through.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
bool MayFoldLoad, const SDLoc &dl,
MVT VT, SDNode *Node,
SDValue &InFlag) {
SDValue N0 = Node->getOperand(0);
SDValue N2 = Node->getOperand(2);
SDValue Imm = Node->getOperand(4);
const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
// Try to fold a load. No need to check alignment.
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
N2.getOperand(0), InFlag };
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
InFlag = SDValue(CNode, 3);
// Update the chain.
ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
return CNode;
}
SDValue Ops[] = { N0, N2, Imm, InFlag };
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
InFlag = SDValue(CNode, 2);
return CNode;
}
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
EVT VT = N->getValueType(0);
// Only handle scalar shifts.
if (VT.isVector())
return false;
// Narrower shifts only mask to 5 bits in hardware.
unsigned Size = VT == MVT::i64 ? 64 : 32;
SDValue OrigShiftAmt = N->getOperand(1);
SDValue ShiftAmt = OrigShiftAmt;
SDLoc DL(N);
// Skip over a truncate of the shift amount.
if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
ShiftAmt = ShiftAmt->getOperand(0);
// This function is called after X86DAGToDAGISel::matchBitExtract(),
// so we are not afraid that we might mess up BZHI/BEXTR pattern.
SDValue NewShiftAmt;
if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
SDValue Add0 = ShiftAmt->getOperand(0);
SDValue Add1 = ShiftAmt->getOperand(1);
// If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
// to avoid the ADD/SUB.
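// (Illustrative: for i32, x << (y + 32) is the same as x << y, since the
// hardware masks the shift count.)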
if (isa<ConstantSDNode>(Add1) &&
cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
NewShiftAmt = Add0;
// If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
// generate a NEG instead of a SUB of a constant.
} else if (ShiftAmt->getOpcode() == ISD::SUB &&
isa<ConstantSDNode>(Add0) &&
cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
// Insert a negate op.
// TODO: This isn't guaranteed to replace the sub if there is a logic cone
// that uses it that's not a shift.
EVT SubVT = ShiftAmt.getValueType();
SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
NewShiftAmt = Neg;
// Insert these operands into a valid topological order so they can
// get selected independently.
insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
} else
return false;
} else
return false;
if (NewShiftAmt.getValueType() != MVT::i8) {
// Need to truncate the shift amount.
NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
// Add to a correct topological ordering.
insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
}
// Insert a new mask to keep the shift amount legal. This should be removed
// by isel patterns.
NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
CurDAG->getConstant(Size - 1, DL, MVT::i8));
// Place in a correct topological ordering.
insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
NewShiftAmt);
if (UpdatedNode != N) {
// If we found an existing node, we should replace ourselves with that node
// and wait for it to be selected after its other users.
ReplaceNode(N, UpdatedNode);
return true;
}
// If the original shift amount is now dead, delete it so that we don't run
// it through isel.
if (OrigShiftAmt.getNode()->use_empty())
CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
// Now that we've optimized the shift amount, defer to normal isel to get
// load folding and legacy vs BMI2 selection without repeating it here.
SelectCode(N);
return true;
}
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
MVT NVT = N->getSimpleValueType(0);
unsigned Opcode = N->getOpcode();
SDLoc dl(N);
// For operations of the form (x << C1) op C2, check if we can use a smaller
// encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
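// For example (illustrative), (x << 8) ^ 0x7f00 becomes (x ^ 0x7f) << 8,
// shrinking the constant to an imm8 encoding.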
SDValue Shift = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
if (!Cst)
return false;
int64_t Val = Cst->getSExtValue();
// If we have an any_extend feeding the AND, look through it to see if there
// is a shift behind it. But only if the AND doesn't use the extended bits.
// FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
bool FoundAnyExtend = false;
if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
isUInt<32>(Val)) {
FoundAnyExtend = true;
Shift = Shift.getOperand(0);
}
if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
return false;
// i8 is unshrinkable, i16 should be promoted to i32.
if (NVT != MVT::i32 && NVT != MVT::i64)
return false;
ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!ShlCst)
return false;
uint64_t ShAmt = ShlCst->getZExtValue();
// Make sure that we don't change the operation by removing bits.
// This only matters for OR and XOR, AND is unaffected.
uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
return false;
// Check the minimum bitwidth for the new constant.
// TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
if (Opcode == ISD::AND) {
// AND32ri is the same as AND64ri32 with zext imm.
// Try this before sign extended immediates below.
ShiftedVal = (uint64_t)Val >> ShAmt;
if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
return true;
// Also swap order when the AND can become MOVZX.
if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
return true;
}
ShiftedVal = Val >> ShAmt;
if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
(!isInt<32>(Val) && isInt<32>(ShiftedVal)))
return true;
if (Opcode != ISD::AND) {
// MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
ShiftedVal = (uint64_t)Val >> ShAmt;
if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
return true;
}
return false;
};
int64_t ShiftedVal;
if (!CanShrinkImmediate(ShiftedVal))
return false;
// Ok, we can reorder to get a smaller immediate.
// But it's possible the original immediate allowed an AND to become MOVZX.
// Do this check late so that the MaskedValueIsZero call is delayed as long
// as possible.
if (Opcode == ISD::AND) {
// Find the smallest zext this could possibly be.
unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
// Figure out which bits need to be zero to achieve that mask.
APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
ZExtWidth);
NeededMask &= ~Cst->getAPIntValue();
if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
return false;
}
SDValue X = Shift.getOperand(0);
if (FoundAnyExtend) {
SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
X = NewX;
}
SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
Shift.getOperand(1));
ReplaceNode(N, NewSHL.getNode());
SelectCode(NewSHL.getNode());
return true;
}
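// Worked example (illustrative, assuming a 32-bit OR): the immediate in
//   (or (shl X, 8), 0x1200)
// needs a 4-byte imm32 encoding, but 0x1200 >> 8 == 0x12 fits in a
// sign-extended imm8, so the transform above rewrites it to
//   (shl (or X, 0x12), 8)
// This is legal here because (0x1200 & ((1 << 8) - 1)) == 0, i.e. the OR
// does not touch the bits the shift would have cleared.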
/// Convert vector increment or decrement to sub/add with an all-ones constant:
/// add X, <1, 1...> --> sub X, <-1, -1...>
/// sub X, <1, 1...> --> add X, <-1, -1...>
/// The all-ones vector constant can be materialized using a pcmpeq instruction
/// that is commonly recognized as an idiom (has no register dependency), so
/// that's better/smaller than loading a splat 1 constant.
bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
"Unexpected opcode for increment/decrement transform");
EVT VT = Node->getValueType(0);
assert(VT.isVector() && "Should only be called for vectors.");
SDValue X = Node->getOperand(0);
SDValue OneVec = Node->getOperand(1);
APInt SplatVal;
if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
return false;
SDLoc DL(Node);
SDValue OneConstant, AllOnesVec;
APInt Ones = APInt::getAllOnesValue(32);
assert(VT.getSizeInBits() % 32 == 0 &&
"Expected bit count to be a multiple of 32");
OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
insertDAGNode(*CurDAG, X, OneConstant);
unsigned NumElts = VT.getSizeInBits() / 32;
assert(NumElts > 0 && "Expected to get non-empty vector.");
AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
DL, OneConstant);
insertDAGNode(*CurDAG, X, AllOnesVec);
AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
insertDAGNode(*CurDAG, X, AllOnesVec);
unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
ReplaceNode(Node, NewNode.getNode());
SelectCode(NewNode.getNode());
return true;
}
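// Illustrative lowering (not from the source): for v4i32 this turns
//   paddd  X, <1,1,1,1>       ; splat-1 constant typically loaded from memory
// into
//   pcmpeqd %xmm1, %xmm1      ; materialize <-1,-1,-1,-1> with no load
//   psubd   %xmm1, %xmm0      ; X - (-1) == X + 1
// pcmpeqd of a register with itself is the recognized all-ones idiom with no
// dependency on the source register, as the comment above notes, so this is
// usually both smaller and cheaper than a splat load.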
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
// i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
// have immediate operands.
MVT VT = And->getSimpleValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
return false;
auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
if (!And1C)
return false;
// Bail out if the mask constant is already negative. It can't shrink any more.
// If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
// patterns to use a 32-bit and instead of a 64-bit and by relying on the
// implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
// are negative too.
APInt MaskVal = And1C->getAPIntValue();
unsigned MaskLZ = MaskVal.countLeadingZeros();
if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
return false;
// Don't extend into the upper 32 bits of a 64 bit mask.
if (VT == MVT::i64 && MaskLZ >= 32) {
MaskLZ -= 32;
MaskVal = MaskVal.trunc(32);
}
SDValue And0 = And->getOperand(0);
APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
APInt NegMaskVal = MaskVal | HighZeros;
// If a negative constant would not allow a smaller encoding, there's no need
// to continue. Only change the constant when we know it's a win.
unsigned MinWidth = NegMaskVal.getMinSignedBits();
if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
return false;
// Extend masks if we truncated above.
if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
NegMaskVal = NegMaskVal.zext(64);
HighZeros = HighZeros.zext(64);
}
// The variable operand must be all zeros in the top bits to allow using the
// new, negative constant as the mask.
if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
return false;
// Check if the mask is -1. In that case, this is an unnecessary instruction
// that escaped earlier analysis.
if (NegMaskVal.isAllOnesValue()) {
ReplaceNode(And, And0.getNode());
return true;
}
// A negative mask allows a smaller encoding. Create a new 'and' node.
SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
ReplaceNode(And, NewAnd.getNode());
SelectCode(NewAnd.getNode());
return true;
}
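// Worked example (illustrative, i32): for (and X, 0x0000FFF0), MaskLZ is 16,
// so HighZeros is 0xFFFF0000 and the candidate mask becomes
//   0x0000FFF0 | 0xFFFF0000 == 0xFFFFFFF0 == -16,
// whose minimum signed width is 5 bits. If MaskedValueIsZero proves the top
// 16 bits of X are zero, the 4-byte immediate 0x0000FFF0 can be replaced by
// the 1-byte sign-extended immediate -16.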
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
bool FoldedBCast, bool Masked) {
if (Masked) {
if (FoldedLoad) {
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v16i8:
return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
case MVT::v8i16:
return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
case MVT::v32i8:
return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
case MVT::v16i16:
return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
case MVT::v64i8:
return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
case MVT::v32i16:
return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
}
}
if (FoldedBCast) {
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
}
}
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v16i8:
return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
case MVT::v8i16:
return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
case MVT::v32i8:
return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
case MVT::v16i16:
return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
case MVT::v64i8:
return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
case MVT::v32i16:
return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
}
}
if (FoldedLoad) {
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v16i8:
return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
case MVT::v8i16:
return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
case MVT::v32i8:
return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
case MVT::v16i16:
return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
case MVT::v64i8:
return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
case MVT::v32i16:
return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
}
}
if (FoldedBCast) {
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
}
}
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v16i8:
return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
case MVT::v8i16:
return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
case MVT::v32i8:
return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
case MVT::v16i16:
return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
case MVT::v64i8:
return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
case MVT::v32i16:
return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
}
}
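// Reading aid (not part of the original source): the opcode suffixes above
// follow the usual LLVM AVX-512 naming. Assuming the standard scheme,
// "rr" is the register/register form, "rm" folds a memory operand,
// "rmb" folds a broadcast memory operand, and a trailing "k" marks the
// variants that take an extra input mask in a k-register (the forms chosen
// when Masked is set).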
// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation.
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
SDValue InMask) {
assert(Subtarget->hasAVX512() && "Expected AVX512!");
assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Unexpected VT!");
// Look for equal and not equal compares.
ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return false;
SDValue SetccOp0 = Setcc.getOperand(0);
SDValue SetccOp1 = Setcc.getOperand(1);
// Canonicalize the all zero vector to the RHS.
if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
std::swap(SetccOp0, SetccOp1);
// See if we're comparing against zero.
if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
return false;
SDValue N0 = SetccOp0;
MVT CmpVT = N0.getSimpleValueType();
MVT CmpSVT = CmpVT.getVectorElementType();
// Start with both operands the same. We'll try to refine this.
SDValue Src0 = N0;
SDValue Src1 = N0;
{
// Look through single use bitcasts.
SDValue N0Temp = N0;
if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
N0Temp = N0.getOperand(0);
// Look for single use AND.
if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
Src0 = N0Temp.getOperand(0);
Src1 = N0Temp.getOperand(1);
}
}
// Without VLX we need to widen the compare to 512 bits.
bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
// We can only fold loads if the sources are unique.
bool CanFoldLoads = Src0 != Src1;
// Try to fold loads unless we need to widen.
bool FoldedLoad = false;
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
if (!Widen && CanFoldLoads) {
Load = Src1;
FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
Tmp4);
if (!FoldedLoad) {
// AND is commutative.
Load = Src0;
FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
Tmp3, Tmp4);
if (FoldedLoad)
std::swap(Src0, Src1);
}
}
auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
// Look through single use bitcasts.
if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
Parent = Src.getNode();
Src = Src.getOperand(0);
}
if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
return Src;
}
return SDValue();
};
// If we didn't fold a load, try to match a broadcast. There is no widening
// limitation for this, but only 32- and 64-bit element types are supported.
bool FoldedBCast = false;
if (!FoldedLoad && CanFoldLoads &&
(CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
SDNode *ParentNode = N0.getNode();
if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
Tmp1, Tmp2, Tmp3, Tmp4);
}
// Try the other operand.
if (!FoldedBCast) {
SDNode *ParentNode = N0.getNode();
if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
Tmp1, Tmp2, Tmp3, Tmp4);
if (FoldedBCast)
std::swap(Src0, Src1);
}
}
}
auto getMaskRC = [](MVT MaskVT) {
switch (MaskVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v2i1: return X86::VK2RegClassID;
case MVT::v4i1: return X86::VK4RegClassID;
case MVT::v8i1: return X86::VK8RegClassID;
case MVT::v16i1: return X86::VK16RegClassID;
case MVT::v32i1: return X86::VK32RegClassID;
case MVT::v64i1: return X86::VK64RegClassID;
}
};
bool IsMasked = InMask.getNode() != nullptr;
SDLoc dl(Root);
MVT ResVT = Setcc.getSimpleValueType();
MVT MaskVT = ResVT;
if (Widen) {
// Widen the inputs using insert_subreg or copy_to_regclass.
unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
CmpVT), 0);
Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
assert(!FoldedLoad && "Shouldn't have folded the load");
if (!FoldedBCast)
Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
if (IsMasked) {
// Widen the mask.
unsigned RegClass = getMaskRC(MaskVT);
SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
dl, MaskVT, InMask, RC), 0);
}
}
bool IsTestN = CC == ISD::SETEQ;
unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
IsMasked);
MachineSDNode *CNode;
if (FoldedLoad || FoldedBCast) {
SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
if (IsMasked) {
SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
Load.getOperand(0) };
CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
} else {
SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
Load.getOperand(0) };
CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
}
// Update the chain.
ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
} else {
if (IsMasked)
CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
else
CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
}
// If we widened, we need to shrink the mask VT.
if (Widen) {
unsigned RegClass = getMaskRC(ResVT);
SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
dl, ResVT, SDValue(CNode, 0), RC);
}
ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
CurDAG->RemoveDeadNode(Root);
return true;
}
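// Illustrative selection (assuming AVX512VL): for
//   (and (setcc (and X, Y), 0, setne), M)
// on v4i32 this emits roughly  vptestmd %xmm1, %xmm0, %k0 {%k1}  -- the AND
// feeding the compare becomes the two VPTESTM sources, the comparison with
// zero is implicit in the instruction, and the mask M rides along as the
// {%k1} operand. A setcc with SETEQ selects the complemented VPTESTNMD form
// instead.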
// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
// into vpternlog.
bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
MVT NVT = N->getSimpleValueType(0);
// Make sure we support VPTERNLOG.
if (!NVT.isVector() || !Subtarget->hasAVX512())
return false;
// We need VLX for 128/256-bit.
if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
return false;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Canonicalize AND to LHS.
if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::AND ||
N1.getOpcode() != X86ISD::ANDNP ||
!N0.hasOneUse() || !N1.hasOneUse())
return false;
// ANDN is not commutable, so use it to pin down A and C.
SDValue A = N1.getOperand(0);
SDValue C = N1.getOperand(1);
// AND is commutable, if one operand matches A, the other operand is B.
// Otherwise this isn't a match.
SDValue B;
if (N0.getOperand(0) == A)
B = N0.getOperand(1);
else if (N0.getOperand(1) == A)
B = N0.getOperand(0);
else
return false;
SDLoc dl(N);
SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
ReplaceNode(N, Ternlog.getNode());
SelectCode(Ternlog.getNode());
return true;
}
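// Worked example (illustrative): the 0xCA control byte is just the truth
// table of the selected expression evaluated on the canonical inputs
// A = 0xF0, B = 0xCC, C = 0xAA:
//   (A & B) | (~A & C) == (0xF0 & 0xCC) | (0x0F & 0xAA)
//                      == 0xC0 | 0x0A == 0xCA
// which is exactly how VPTERNLOG immediates are derived.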
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opcode = Node->getOpcode();
SDLoc dl(Node);
if (Node->isMachineOpcode()) {
LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
Node->setNodeId(-1);
return; // Already selected.
}
switch (Opcode) {
default: break;
case ISD::INTRINSIC_VOID: {
unsigned IntNo = Node->getConstantOperandVal(1);
switch (IntNo) {
default: break;
case Intrinsic::x86_sse3_monitor:
case Intrinsic::x86_monitorx:
case Intrinsic::x86_clzero: {
bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
unsigned Opc = 0;
switch (IntNo) {
default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_sse3_monitor:
if (!Subtarget->hasSSE3())
break;
Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
break;
case Intrinsic::x86_monitorx:
if (!Subtarget->hasMWAITX())
break;
Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
break;
case Intrinsic::x86_clzero:
if (!Subtarget->hasCLZERO())
break;
Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
break;
}
if (Opc) {
unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
Node->getOperand(2), SDValue());
SDValue InFlag = Chain.getValue(1);
if (IntNo == Intrinsic::x86_sse3_monitor ||
IntNo == Intrinsic::x86_monitorx) {
// Copy the other two operands to ECX and EDX.
Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
InFlag);
InFlag = Chain.getValue(1);
Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
InFlag);
InFlag = Chain.getValue(1);
}
MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
{ Chain, InFlag});
ReplaceNode(Node, CNode);
return;
}
break;
}
}
break;
}
case ISD::BRIND: {
if (Subtarget->isTargetNaCl())
// NaCl has its own pass where jmp %r32 instructions are converted to
// jmp %r64. We leave the instruction alone.
break;
if (Subtarget->isTarget64BitILP32()) {
// Converts a 32-bit register to a 64-bit, zero-extended version of
// it. This is needed because x86-64 can do many things, but jmp %r32
// ain't one of them.
const SDValue &Target = Node->getOperand(1);
assert(Target.getSimpleValueType() == llvm::MVT::i32);
SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
Node->getOperand(0), ZextTarget);
ReplaceNode(Node, Brind.getNode());
SelectCode(ZextTarget.getNode());
SelectCode(Brind.getNode());
return;
}
break;
}
case X86ISD::GlobalBaseReg:
ReplaceNode(Node, getGlobalBaseReg());
return;
case ISD::BITCAST:
// Just drop all 128/256/512-bit bitcasts.
if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
NVT == MVT::f128) {
ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
CurDAG->RemoveDeadNode(Node);
return;
}
break;
case ISD::VSELECT: {
// Replace VSELECT with non-mask conditions with BLENDV.
if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
break;
assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
SDValue Blendv = CurDAG->getNode(
X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2));
ReplaceNode(Node, Blendv.getNode());
SelectCode(Blendv.getNode());
// We already called ReplaceUses.
return;
}
case ISD::SRL:
if (matchBitExtract(Node))
return;
LLVM_FALLTHROUGH;
case ISD::SRA:
case ISD::SHL:
if (tryShiftAmountMod(Node))
return;
break;
case ISD::AND:
if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
// Try to form a masked VPTESTM. Operands can be in either order.
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
tryVPTESTM(Node, N0, N1))
return;
if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
tryVPTESTM(Node, N1, N0))
return;
}
if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
CurDAG->RemoveDeadNode(Node);
return;
}
if (matchBitExtract(Node))
return;
if (AndImmShrink && shrinkAndImmediate(Node))
return;
LLVM_FALLTHROUGH;
case ISD::OR:
case ISD::XOR:
if (tryShrinkShlLogicImm(Node))
return;
if (Opcode == ISD::OR && tryMatchBitSelect(Node))
return;
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB: {
if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
combineIncDecVector(Node))
return;
// Try to avoid folding immediates with multiple uses for optsize.
// This code tries to select the register form directly to avoid going
// through the isel table, which might fold the immediate. We can't change
// the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
// check the immediate use count without making the patterns unavailable to
// the fast-isel table.
if (!OptForSize)
break;
// Only handle i8/i16/i32/i64.
if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
break;
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
if (!Cst)
break;
int64_t Val = Cst->getSExtValue();
// Make sure it's an immediate that is considered foldable.
// FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
if (!isInt<8>(Val) && !isInt<32>(Val))
break;
// If this can match to INC/DEC, let it go.
if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
break;
// Check if we should avoid folding this immediate.
if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
break;
// We should not fold the immediate. So we need a register form instead.
unsigned ROpc, MOpc;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::i8:
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
}
break;
case MVT::i16:
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
}
break;
case MVT::i32:
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
}
break;
case MVT::i64:
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
}
break;
}
// OK, this is an AND/OR/XOR/ADD/SUB with a constant.
// If this is not a subtract, we can still try to fold a load.
if (Opcode != ISD::SUB) {
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
CurDAG->RemoveDeadNode(Node);
return;
}
}
CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
return;
}
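// Illustrative effect (not from the source): at -Os/-Oz, if the constant
// 0x12345678 feeds several adds, folding it into each instruction costs
// four immediate bytes per use. Selecting the register form here lets the
// constant be materialized once, e.g. roughly
//   movl $0x12345678, %ecx
//   addl %ecx, %eax
//   addl %ecx, %edx
// instead of two separate addl $0x12345678 instructions.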
case X86ISD::SMUL:
// i16/i32/i64 are handled with isel patterns.
if (NVT != MVT::i8)
break;
LLVM_FALLTHROUGH;
case X86ISD::UMUL: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
unsigned LoReg, ROpc, MOpc;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8:
LoReg = X86::AL;
ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
break;
case MVT::i16:
LoReg = X86::AX;
ROpc = X86::MUL16r;
MOpc = X86::MUL16m;
break;
case MVT::i32:
LoReg = X86::EAX;
ROpc = X86::MUL32r;
MOpc = X86::MUL32m;
break;
case MVT::i64:
LoReg = X86::RAX;
ROpc = X86::MUL64r;
MOpc = X86::MUL64m;
break;
}
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
// Multiply is commutative.
if (!FoldedLoad) {
FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
if (FoldedLoad)
std::swap(N0, N1);
}
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
MachineSDNode *CNode;
if (FoldedLoad) {
// i16/i32/i64 use an instruction that produces a low and high result even
// though only the low result is used.
SDVTList VTs;
if (NVT == MVT::i8)
VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
else
VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
// i16/i32/i64 use an instruction that produces a low and high result even
// though only the low result is used.
SDVTList VTs;
if (NVT == MVT::i8)
VTs = CurDAG->getVTList(NVT, MVT::i32);
else
VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
}
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
CurDAG->RemoveDeadNode(Node);
return;
}
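// Illustrative note (assumptions about the encodings): these multiplies use
// the one-operand forms with implicit registers, e.g. mulb %cl computes
// AL * CL into AX, which is why the code above first copies one operand into
// AL/AX/EAX/RAX via CopyToReg and threads the glue into the multiply node.
// For i16/i32/i64 the high-half result exists but is simply left unused.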
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
unsigned Opc, MOpc;
bool isSigned = Opcode == ISD::SMUL_LOHI;
if (!isSigned) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
}
} else {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
}
}
unsigned SrcReg, LoReg, HiReg;
switch (Opc) {
default: llvm_unreachable("Unknown MUL opcode!");
case X86::IMUL32r:
case X86::MUL32r:
SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
break;
case X86::IMUL64r:
case X86::MUL64r:
SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
break;
}
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
// Multiply is commutative.
if (!foldedLoad) {
foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
if (foldedLoad)
std::swap(N0, N1);
}
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
N0, SDValue()).getValue(1);
if (foldedLoad) {
SDValue Chain;
MachineSDNode *CNode = nullptr;
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
Chain = SDValue(CNode, 0);
InFlag = SDValue(CNode, 1);
// Update the chain.
ReplaceUses(N1.getValue(1), Chain);
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
SDValue Ops[] = { N1, InFlag };
SDVTList VTs = CurDAG->getVTList(MVT::Glue);
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
InFlag = SDValue(CNode, 0);
}
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
assert(LoReg && "Register for low half is not defined!");
SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
NVT, InFlag);
InFlag = ResLo.getValue(2);
ReplaceUses(SDValue(Node, 0), ResLo);
LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
dbgs() << '\n');
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
assert(HiReg && "Register for high half is not defined!");
SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
NVT, InFlag);
InFlag = ResHi.getValue(2);
ReplaceUses(SDValue(Node, 1), ResHi);
LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
dbgs() << '\n');
}
CurDAG->RemoveDeadNode(Node);
return;
}
case ISD::SDIVREM:
case ISD::UDIVREM: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
unsigned Opc, MOpc;
bool isSigned = Opcode == ISD::SDIVREM;
if (!isSigned) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
}
} else {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
}
}
unsigned LoReg, HiReg, ClrReg;
unsigned SExtOpcode;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8:
LoReg = X86::AL; ClrReg = HiReg = X86::AH;
SExtOpcode = 0; // Not used.
break;
case MVT::i16:
LoReg = X86::AX; HiReg = X86::DX;
ClrReg = X86::DX;
SExtOpcode = X86::CWD;
break;
case MVT::i32:
LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
SExtOpcode = X86::CDQ;
break;
case MVT::i64:
LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
SExtOpcode = X86::CQO;
break;
}
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
bool signBitIsZero = CurDAG->SignBitIsZero(N0);
SDValue InFlag;
if (NVT == MVT::i8) {
// Special case for div8, just use a move with zero extension to AX to
// clear the upper 8 bits (AH).
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
MachineSDNode *Move;
if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
: X86::MOVZX16rm8;
Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
Chain = SDValue(Move, 1);
ReplaceUses(N0.getValue(1), Chain);
// Record the mem-refs
CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
} else {
unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
: X86::MOVZX16rr8;
Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
Chain = CurDAG->getEntryNode();
}
Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
SDValue());
InFlag = Chain.getValue(1);
} else {
InFlag =
CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
LoReg, N0, SDValue()).getValue(1);
if (isSigned && !signBitIsZero) {
// Sign extend the low part into the high part.
InFlag =
SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
} else {
// Zero out the high part, effectively zero extending the input.
SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
switch (NVT.SimpleTy) {
case MVT::i16:
ClrNode =
SDValue(CurDAG->getMachineNode(
TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
CurDAG->getTargetConstant(X86::sub_16bit, dl,
MVT::i32)),
0);
break;
case MVT::i32:
break;
case MVT::i64:
ClrNode =
SDValue(CurDAG->getMachineNode(
TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
CurDAG->getTargetConstant(X86::sub_32bit, dl,
MVT::i32)),
0);
break;
default:
llvm_unreachable("Unexpected division source");
}
InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
ClrNode, InFlag).getValue(1);
}
}
if (foldedLoad) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
MachineSDNode *CNode =
CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
InFlag = SDValue(CNode, 1);
// Update the chain.
ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
InFlag =
SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
}
// Prevent use of AH in a REX instruction by explicitly copying it to
// an ABCD_L register.
//
// The current assumption of the register allocator is that isel
// won't generate explicit references to the GR8_ABCD_H registers. If
// the allocator and/or the backend get enhanced to be more robust in
// that regard, this can be, and should be, removed.
if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
unsigned AHExtOpcode =
isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
MVT::Glue, AHCopy, InFlag);
SDValue Result(RNode, 0);
InFlag = SDValue(RNode, 1);
Result =
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
ReplaceUses(SDValue(Node, 1), Result);
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
dbgs() << '\n');
}
// Copy the division (low) result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
LoReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 0), Result);
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
dbgs() << '\n');
}
// Copy the remainder (high) result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
HiReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 1), Result);
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
dbgs() << '\n');
}
CurDAG->RemoveDeadNode(Node);
return;
}
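// Illustrative lowering (not from the source): an unsigned i32 divrem
// becomes roughly
//   movl %ebx, %eax        ; dividend into EAX
//   xorl %edx, %edx        ; MOV32r0: zero the high half in EDX
//   divl %ecx              ; quotient -> EAX, remainder -> EDX
// while the signed form replaces the xor with cdq (sign-extend EAX into
// EDX), unless SignBitIsZero above already proved the extension unnecessary.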
case X86ISD::CMP: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
// Optimizations for TEST compares.
if (!isNullConstant(N1))
break;
// Save the original VT of the compare.
MVT CmpVT = N0.getSimpleValueType();
// If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
// by a test instruction. The test should be removed later by
// analyzeCompare if we are using only the zero flag.
// TODO: Should we check the users and use the BEXTR flags directly?
if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
: X86::TEST32rr;
SDValue BEXTR = SDValue(NewNode, 0);
NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
CurDAG->RemoveDeadNode(Node);
return;
}
}
// We can peek through truncates, but we need to be careful below.
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
N0 = N0.getOperand(0);
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
// use a smaller encoding.
// Look past the truncate if CMP is the only use of it.
if (N0.getOpcode() == ISD::AND &&
N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C) break;
uint64_t Mask = C->getZExtValue();
// Check if we can replace AND+IMM64 with a shift. This is possible for
// masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
// flag.
if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
onlyUsesZeroFlag(SDValue(Node, 0))) {
if (isMask_64(~Mask)) {
unsigned TrailingZeros = countTrailingZeros(Mask);
SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
SDValue Shift =
SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
N0.getOperand(0), Imm), 0);
MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
MVT::i32, Shift, Shift);
ReplaceNode(Node, Test);
return;
}
if (isMask_64(Mask)) {
unsigned LeadingZeros = countLeadingZeros(Mask);
SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
SDValue Shift =
SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
N0.getOperand(0), Imm), 0);
MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
MVT::i32, Shift, Shift);
ReplaceNode(Node, Test);
return;
}
}
MVT VT;
int SubRegOp;
unsigned ROpc, MOpc;
// For each of these checks we need to be careful if the sign flag is
// being used. It is only safe to use the sign flag in two conditions,
// either the sign bit in the shrunken mask is zero or the final test
// size is equal to the original compare size.
if (isUInt<8>(Mask) &&
(!(Mask & 0x80) || CmpVT == MVT::i8 ||
hasNoSignFlagUses(SDValue(Node, 0)))) {
// For example, convert "testl %eax, $8" to "testb %al, $8"
VT = MVT::i8;
SubRegOp = X86::sub_8bit;
ROpc = X86::TEST8ri;
MOpc = X86::TEST8mi;
} else if (OptForMinSize && isUInt<16>(Mask) &&
(!(Mask & 0x8000) || CmpVT == MVT::i16 ||
hasNoSignFlagUses(SDValue(Node, 0)))) {
// For example, "testl %eax, $32776" to "testw %ax, $32776".
// NOTE: We only want to form TESTW instructions if optimizing for
// min size. Otherwise we only save one byte and possibly get a length
// changing prefix penalty in the decoders.
VT = MVT::i16;
SubRegOp = X86::sub_16bit;
ROpc = X86::TEST16ri;
MOpc = X86::TEST16mi;
} else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
((!(Mask & 0x80000000) &&
// Without minsize 16-bit Cmps can get here so we need to
// be sure we calculate the correct sign flag if needed.
(CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
CmpVT == MVT::i32 ||
hasNoSignFlagUses(SDValue(Node, 0)))) {
// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
// NOTE: We only want to run that transform if N0 is 32 or 64 bits.
// Otherwise, we find ourselves in a position where we have to do
// promotion. If previous passes did not promote the and, we assume
// they had a good reason not to and do not promote here.
VT = MVT::i32;
SubRegOp = X86::sub_32bit;
ROpc = X86::TEST32ri;
MOpc = X86::TEST32mi;
} else {
// No eligible transformation was found.
break;
}
SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
SDValue Reg = N0.getOperand(0);
// Emit a testb, testw, or testl.
MachineSDNode *NewNode;
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
if (!LoadN->isSimple()) {
unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
if (MOpc == X86::TEST8mi && NumVolBits != 8)
break;
else if (MOpc == X86::TEST16mi && NumVolBits != 16)
break;
else if (MOpc == X86::TEST32mi && NumVolBits != 32)
break;
}
}
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
Reg.getOperand(0) };
NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
// Update the chain.
ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
// Record the mem-refs
CurDAG->setNodeMemRefs(NewNode,
{cast<LoadSDNode>(Reg)->getMemOperand()});
} else {
// Extract the subregister if necessary.
if (N0.getValueType() != VT)
Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
}
// Replace CMP with TEST.
ReplaceNode(Node, NewNode);
return;
}
break;
}
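// Worked example (illustrative) for the shift rewrite above: with
// CmpVT == i64, Mask == 0xFFFFFFFF00000000 and only ZF consumed,
// ~Mask == 0x00000000FFFFFFFF is a low-bit mask, so
//   and rax, 0xFFFFFFFF00000000 ; test
// becomes
//   shrq $32, %rax              ; countTrailingZeros(Mask) == 32
//   testq %rax, %rax
// avoiding the movabs that the unencodable 64-bit AND immediate would need.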
case X86ISD::PCMPISTR: {
if (!Subtarget->hasSSE42())
break;
bool NeedIndex = !SDValue(Node, 0).use_empty();
bool NeedMask = !SDValue(Node, 1).use_empty();
// We can't fold a load if we are going to make two instructions.
bool MayFoldLoad = !NeedIndex || !NeedMask;
MachineSDNode *CNode;
if (NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
}
if (NeedIndex || !NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
}
// Connect the flag usage to the last instruction created.
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
CurDAG->RemoveDeadNode(Node);
return;
}
case X86ISD::PCMPESTR: {
if (!Subtarget->hasSSE42())
break;
// Copy the two implicit register inputs.
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
Node->getOperand(1),
SDValue()).getValue(1);
InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
Node->getOperand(3), InFlag).getValue(1);
bool NeedIndex = !SDValue(Node, 0).use_empty();
bool NeedMask = !SDValue(Node, 1).use_empty();
// We can't fold a load if we are going to make two instructions.
bool MayFoldLoad = !NeedIndex || !NeedMask;
MachineSDNode *CNode;
if (NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
InFlag);
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
}
if (NeedIndex || !NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
}
// Connect the flag usage to the last instruction created.
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
CurDAG->RemoveDeadNode(Node);
return;
}
case ISD::SETCC: {
if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
return;
break;
}
case ISD::STORE:
if (foldLoadStoreIntoMemOperand(Node))
return;
break;
}
SelectCode(Node);
}
bool X86DAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) {
SDValue Op0, Op1, Op2, Op3, Op4;
switch (ConstraintID) {
default:
llvm_unreachable("Unexpected asm memory constraint");
case InlineAsm::Constraint_o: // offsetable ??
case InlineAsm::Constraint_v: // not offsetable ??
case InlineAsm::Constraint_m: // memory
case InlineAsm::Constraint_X:
if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
return true;
break;
}
OutOps.push_back(Op0);
OutOps.push_back(Op1);
OutOps.push_back(Op2);
OutOps.push_back(Op3);
OutOps.push_back(Op4);
return false;
}
/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
CodeGenOpt::Level OptLevel) {
return new X86DAGToDAGISel(TM, OptLevel);
}
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 362609)
@@ -1,47327 +1,47336 @@
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;
#define DEBUG_TYPE "x86-isel"
STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<int> ExperimentalPrefLoopAlignment(
"x86-experimental-pref-loop-alignment", cl::init(4),
cl::desc(
"Sets the preferable loop alignment for experiments (as log2 bytes)"
"(the last x86-experimental-pref-loop-alignment bits"
" of the loop header PC will be 0)."),
cl::Hidden);
// Added in 10.0.
static cl::opt<bool> EnableOldKNLABI(
"x86-enable-old-knl-abi", cl::init(false),
cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
"one ZMM register on AVX512F, but not AVX512BW targets."),
cl::Hidden);
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
"SHIFT, LEA, etc."),
cl::Hidden);
static cl::opt<bool> ExperimentalUnorderedISEL(
"x86-experimental-unordered-atomic-isel", cl::init(false),
cl::desc("Use LoadSDNode and StoreSDNode instead of "
"AtomicSDNode for unordered atomic loads and "
"stores respectively."),
cl::Hidden);
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
const char *Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
// X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// For 64-bit, since we have so many registers, use the ILP scheduler.
// For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
if (Subtarget.isAtom())
setSchedulingPreference(Sched::ILP);
else if (Subtarget.is64Bit())
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
addBypassSlowDiv(64, 32);
}
if (Subtarget.isTargetWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
setLibcallName(RTLIB::SREM_I64, "_allrem");
setLibcallName(RTLIB::UREM_I64, "_aullrem");
setLibcallName(RTLIB::MUL_I64, "_allmul");
setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
// MSVCRT doesn't have powi; fall back to pow
setLibcallName(RTLIB::POWI_F32, nullptr);
setLibcallName(RTLIB::POWI_F64, nullptr);
}
// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
// to 32 bits so the AtomicExpandPass will expand it so we don't need
// cmpxchg8b.
// FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
if (!Subtarget.hasCmpxchg8b())
setMaxAtomicSizeInBitsSupported(32);
// Set up the register classes.
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
addRegisterClass(MVT::i32, &X86::GR32RegClass);
if (Subtarget.is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
setTruncStoreAction(MVT::i32, MVT::i16, Expand);
setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
setTruncStoreAction(MVT::i16, MVT::i8, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
setOperationAction(ISD::ABS , MVT::i32 , Custom);
}
setOperationAction(ISD::ABS , MVT::i64 , Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , Custom);
if (Subtarget.is64Bit())
setOperationAction(ShiftOp , MVT::i64 , Custom);
}
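// Illustrative note (not from the source): a funnel shift concatenates its
// two inputs and shifts the double-wide value, e.g. for i32
//   fshl(a, b, c) == (a << (c & 31)) | (b >> (32 - (c & 31)))
// (with the usual care when c & 31 == 0, where the result is just a). The
// custom lowering selected above can map this onto x86's shld/shrd
// double-shift instructions.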
if (!Subtarget.useSoftFloat()) {
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
// Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
// this operation.
setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
// SSE has no i16 to fp conversion, only i32. We promote in the handler
// to allow f80 to use i16 and f64 to use i16 with sse1 only.
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
// f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
// Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
// this operation.
setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
// Handle FP_TO_UINT by promoting the destination to a larger signed
// conversion.
setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
}
// Handle address space casts between mixed sized pointers.
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
}
} else if (!Subtarget.is64Bit())
setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
// Scalar integer divide and remainder are lowered to use operations that
// produce two results, to match the available instructions. This exposes
// the two-result form to trivial CSE, which is able to combine x/y and x%y
// into a single instruction.
//
// Scalar integer multiply-high is also lowered to use two-result
// operations, to match the available instructions. However, plain multiply
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
}
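// For example (a sketch of the intent): when both x/y and x%y appear,
// DAGCombine can merge the expanded forms into a single ISD::SDIVREM /
// ISD::UDIVREM node, which maps onto x86's IDIV/DIV producing quotient
// and remainder in one instruction.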
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::BR_CC, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
}
if (Subtarget.is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FREM , MVT::f128 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
// Promote the i8 variants and force them up to i32, which has a shorter
// encoding.
setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
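// Sketch of the promotion (illustrative, not the literal expansion):
//   cttz8(x) ~= cttz32((uint32_t)x | 0x100)
// Setting bit 8 caps the result at 8 for x == 0, so the 32-bit
// instruction's semantics carry over; CTTZ_ZERO_UNDEF can skip the OR.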
if (!Subtarget.hasBMI()) {
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
}
}
if (Subtarget.hasLZCNT()) {
// When promoting the i8 variants, force them to i32 for a shorter
// encoding.
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
}
}
// Special handling for half-precision floating-point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
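// Illustrative effect (helper names are the usual RTLIB defaults, not
// guaranteed here): without F16C, half <-> float conversions become
// runtime calls along the lines of
//   float f = __gnu_h2f_ieee(h);
//   uint16_t h2 = __gnu_f2h_ieee(f);
// instead of VCVTPH2PS/VCVTPS2PH instructions.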
// There's never any support for operations beyond MVT::f32.
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
else
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
}
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
// Darwin ABI issue.
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::ConstantPool , VT, Custom);
setOperationAction(ISD::JumpTable , VT, Custom);
setOperationAction(ISD::GlobalAddress , VT, Custom);
setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
}
// 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SHL_PARTS, VT, Custom);
setOperationAction(ISD::SRA_PARTS, VT, Custom);
setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// Expand certain atomics
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
// FIXME - use subtarget debug flags
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
bool Is64Bit = Subtarget.is64Bit();
setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
: &X86::FR32RegClass);
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
: &X86::FR64RegClass);
// Disable f32->f64 extload as we can only generate this in one instruction
// under optsize. So it's easier to pattern match (fpext (load)) for that
// case instead of needing to emit 2 instructions for extload in the
// non-optsize case.
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
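// Illustrative DAG shape: with the extload disabled,
//   (fpext (load addr))
// survives as two nodes, so isel can fold it into a single CVTSS2SD with
// a memory operand when optimizing for size, instead of legalization
// committing to an extending load unconditionally.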
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG, VT, Custom);
// Use ANDPD and ORPD to simulate FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FSUB, VT, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
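// The bit tricks referenced above (sketch, f64 shown):
//   fabs(x)        -> x & 0x7fffffffffffffff         (ANDPD, clear sign)
//   fneg(x)        -> x ^ 0x8000000000000000         (XORPD, flip sign)
//   copysign(m, s) -> (m & ~SIGN_MASK) | (s & SIGN_MASK)  (ANDPD/ORPD)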
// Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
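// Sketch: FGETSIGN lowers to (and (movmsk x), 1) -- MOVMSKPD/MOVMSKPS
// copies the packed sign bits into a GPR and the AND isolates lane 0.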
} else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
if (UseX87)
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG , MVT::f32, Custom);
if (UseX87)
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
// Use ANDPS and ORPS to simulate FCOPYSIGN.
if (UseX87)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (UseX87) {
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
}
} else if (UseX87) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
addRegisterClass(MVT::f32, &X86::RFP32RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
setOperationAction(ISD::UNDEF, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
}
// Expand FP32 immediates into loads from the stack, save special cases.
if (isTypeLegal(MVT::f32)) {
if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
addLegalFPImmediate(APFloat(+0.0f)); // FLD0
addLegalFPImmediate(APFloat(+1.0f)); // FLD1
addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0f)); // xorps
}
// Expand FP64 immediates into loads from the stack, save special cases.
if (isTypeLegal(MVT::f64)) {
if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
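// Any other FP constant (illustrative): e.g. 3.14 is placed in the
// constant pool and materialized with a load (MOVSD from a .LCPI label),
// while the special cases above use FLD0/FLD1 (+FCHS) or a zeroing
// XORPS/XORPD.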
// Handle constrained floating-point operations for scalars.
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
// f80 always uses X87.
if (UseX87) {
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
addLegalFPImmediate(TmpFlt); // FLD0
TmpFlt.changeSign();
addLegalFPImmediate(TmpFlt); // FLD0/FCHS
bool ignored;
APFloat TmpFlt2(+1.0);
TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
&ignored);
addLegalFPImmediate(TmpFlt2); // FLD1
TmpFlt2.changeSign();
addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
}
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , MVT::f80, Expand);
setOperationAction(ISD::FCOS , MVT::f80, Expand);
setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
setOperationAction(ISD::FCEIL, MVT::f80, Expand);
setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
setOperationAction(ISD::LROUND, MVT::f80, Expand);
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
setOperationAction(ISD::LRINT, MVT::f80, Expand);
setOperationAction(ISD::LLRINT, MVT::f80, Expand);
// Handle constrained floating-point operations for scalars.
setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
// FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
// as Custom.
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
}
// f128 uses xmm registers, but most operations require libcalls.
if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
setOperationAction(ISD::FADD, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
setOperationAction(ISD::FSUB, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
setOperationAction(ISD::FDIV, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
setOperationAction(ISD::FMUL, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
setOperationAction(ISD::FMA, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
setOperationAction(ISD::FABS, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
setOperationAction(ISD::FSIN, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
setOperationAction(ISD::FCOS, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
// No STRICT_FSINCOS
setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
// We need to custom handle any FP_ROUND with an f128 input, but
// LegalizeDAG uses the result type to know when to run a custom handler.
// So we have to list all legal floating point result types here.
if (isTypeLegal(MVT::f32)) {
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
}
if (isTypeLegal(MVT::f64)) {
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
}
if (isTypeLegal(MVT::f80)) {
setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
}
setOperationAction(ISD::SETCC, MVT::f128, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);
}
// Always use a library call for pow.
setOperationAction(ISD::FPOW , MVT::f32 , Expand);
setOperationAction(ISD::FPOW , MVT::f64 , Expand);
setOperationAction(ISD::FPOW , MVT::f80 , Expand);
setOperationAction(ISD::FPOW , MVT::f128 , Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// Some FP actions are always expanded for vector types.
for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
}
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SETCC, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
setOperationAction(ISD::TRUNCATE, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
// types; we have to deal with them whether we ask for Expansion or not.
// Setting Expand causes its own optimisation problems though, so leave
// them legal.
if (VT.getVectorElementType() == MVT::i1)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
// split/scalarized right now.
if (VT.getVectorElementType() == MVT::f16)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
}
}
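// Net effect of the loop above (illustrative): every fixed-width vector
// operation starts out pessimized, e.g. a v4i32 SDIV would be scalarized
// into four i32 divides; the feature-gated blocks below then selectively
// re-enable what each subtarget can codegen directly.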
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx are supported; everything uses intrinsics.
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
// registers cannot be used even for integer operations.
addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::SREM, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::UREM, VT, Custom);
}
setOperationAction(ISD::MUL, MVT::v2i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i8, Custom);
setOperationAction(ISD::MUL, MVT::v8i8, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
}
setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
if (VT == MVT::v2i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// Custom lower v2i64 and v2f64 selects.
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over-promotion or custom promotion.
for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
}
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
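// Illustrative outcome on x86-64 (a sketch): a v2i32 load becomes a
// single 8-byte scalar FP load feeding a bitcast, e.g. MOVSD, rather
// than an i64 integer load that would then need a GPR->XMM transfer.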
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
// With AVX512, expanding (and promoting the shifts) is better.
if (!Subtarget.hasAVX512())
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
setOperationAction(ISD::ABS, MVT::v16i8, Legal);
setOperationAction(ISD::ABS, MVT::v8i16, Legal);
setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::ADD, MVT::i16, Custom);
setOperationAction(ISD::ADD, MVT::i32, Custom);
setOperationAction(ISD::SUB, MVT::i16, Custom);
setOperationAction(ISD::SUB, MVT::i32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::FCEIL, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::FRINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
// We directly match byte blends in the backend as they match the VSELECT
// condition form.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
}
// SSE41 also has vector sign/zero-extending loads, PMOV[SZ]X.
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
}
// i8 vectors are custom because the source register and source memory
// operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
// We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
// do the pre and post work in the vector domain.
setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
// We need to mark SINT_TO_FP as Custom even though we want to expand it
// so that DAG combine doesn't try to turn it into uint_to_fp.
setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
}
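// Sketch of the scalarization mentioned above: each i64 lane is
// extracted and converted with CVTSI2SS, while the unsigned fix-up and
// the final v4f32 reassembly stay in the vector domain.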
}
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::ROTL, VT, Custom);
// XOP can efficiently perform BITREVERSE with VPPERM.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
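// Illustrative shape: (fp_to_sint:v8i16 x:v8f32) is promoted to
//   (truncate:v8i16 (fp_to_sint:v8i32 x))
// since VCVTTPS2DQ produces i32 lanes; the truncate then narrows the
// already-legal v8i32 result.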
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
// These types need custom splitting if their input is a 128-bit vector.
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
// With BWI, expanding (and promoting the shifts) is better.
if (!Subtarget.hasBWI())
setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
}
}
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
}
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
setOperationAction(ISD::ABS, MVT::v4i64, Custom);
setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
}
for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
if (HasInt256) {
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256-bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero-extending loads, VPMOV[SZ]X.
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
}
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
}
// Custom lower several nodes for 256-bit types.
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
}
if (HasInt256) {
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MGATHER, VT, Custom);
}
}
// This block controls legalization of the mask vector sizes that are
// available with AVX512. 512-bit vectors are in a separate block controlled
// by useAVX512Regs.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
// There is no byte-sized k-register load or store without AVX512DQ.
if (!Subtarget.hasDQI()) {
setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
setOperationAction(ISD::STORE, MVT::v1i1, Custom);
setOperationAction(ISD::STORE, MVT::v2i1, Custom);
setOperationAction(ISD::STORE, MVT::v4i1, Custom);
setOperationAction(ISD::STORE, MVT::v8i1, Custom);
}
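// Sketch: without AVX512DQ there is no KMOVB, so e.g. a v8i1 store is
// custom lowered -- typically by widening the mask to 16 bits so KMOVW
// (or a trip through a GPR) can be used.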
// Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
}
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
// This block controls legalization for 512-bit operations with 32/64-bit
// elements. 512-bit vectors can be disabled based on the prefer-vector-width
// and required-vector-width function attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
}
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
// to 512-bit rather than use the AVX2 instructions so that we can use
// k-masks.
if (!Subtarget.hasVLX()) {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
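// Sketch: without VLX, a masked load of v8f32 is widened to the 512-bit
// form with its mask extended into a k-register, in preference to the
// AVX2 VMASKMOV encoding.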
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
// Need to custom widen this if we don't have AVX512BW.
setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
}
// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
if (Subtarget.hasCDI()) {
// Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 256-bit but the source is 512-bit wide.
// The 128-bit case was made Legal under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
if (!Subtarget.hasBWI()) {
// Need to custom split v32i16/v64i8 bitcasts.
setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
// Better to split these into two 256-bit ops.
setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
}
if (Subtarget.hasVBMI2()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
}// has AVX-512
// This block controls legalization for operations that don't have
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
// narrower widths.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
}
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MSCATTER, VT, Custom);
if (Subtarget.hasDQI()) {
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::UINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_SINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MUL, VT, Legal);
}
}
if (Subtarget.hasCDI()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
}
// This block controls legalization of v32i1/v64i1 which are available with
// AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
// useBWIRegs.
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
}
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
for (auto VT : { MVT::v16i1, MVT::v32i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Extends from v32i1 masks to 256-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
}
// This block controls legalization for v32i16 and v64i8. 512-bits can be
// disabled based on prefer-vector-width and required-vector-width function
// attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
// Extends from v64i1 masks to 512-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
}
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v64i8, MVT::v32i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
if (Subtarget.hasVBMI2()) {
setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
}
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
}
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
if (Subtarget.hasVBMI2()) {
// TODO: Make these legal even without VLX?
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
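// On 32-bit targets, chained intrinsics that produce an i64 result (for
// example, RDTSC-style intrinsics whose value is returned in two 32-bit
// halves) need custom lowering to split the result.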
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
}
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
//
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
// Add/Sub/Mul with overflow operations are custom lowered.
setOperationAction(ISD::SADDO, VT, Custom);
setOperationAction(ISD::UADDO, VT, Custom);
setOperationAction(ISD::SSUBO, VT, Custom);
setOperationAction(ISD::USUBO, VT, Custom);
setOperationAction(ISD::SMULO, VT, Custom);
setOperationAction(ISD::UMULO, VT, Custom);
// Support carry in as value rather than glue.
setOperationAction(ISD::ADDCARRY, VT, Custom);
setOperationAction(ISD::SUBCARRY, VT, Custom);
setOperationAction(ISD::SETCCCARRY, VT, Custom);
}
if (!Subtarget.is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
setLibcallName(RTLIB::MUL_I128, nullptr);
}
// Combine sin / cos into _sincos_stret if it is available.
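// For example, separate calls to sin(x) and cos(x) on the same argument can
// then be folded into a single sincos_stret-style call returning both
// results at once.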
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
setOperationAction(ISD::UREM, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
}
// On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` is. We
// should promote the value to 64 bits to solve this. This is what the CRT
// headers do: `fmodf` is an inline header function that casts to f64 and
// calls `fmod`.
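// Illustratively, fmodf(x, y) is lowered as roughly
// (float)fmod((double)x, (double)y).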
if (Subtarget.is32Bit() &&
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
{ISD::FCEIL, ISD::STRICT_FCEIL,
ISD::FCOS, ISD::STRICT_FCOS,
ISD::FEXP, ISD::STRICT_FEXP,
ISD::FFLOOR, ISD::STRICT_FFLOOR,
ISD::FREM, ISD::STRICT_FREM,
ISD::FLOG, ISD::STRICT_FLOG,
ISD::FLOG10, ISD::STRICT_FLOG10,
ISD::FPOW, ISD::STRICT_FPOW,
ISD::FSIN, ISD::STRICT_FSIN})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
computeRegisterProperties(Subtarget.getRegisterInfo());
MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
// TODO: These control memcmp expansion in CGP and could be raised higher, but
// that needs to be benchmarked and balanced with the potential use of vector
// load/store types (PR33329, PR33914).
MaxLoadsPerMemcmp = 2;
MaxLoadsPerMemcmpOptSize = 2;
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(Align(16));
verifyIntrinsicTables();
// Default to having -disable-strictnode-mutation on
IsStrictFPEnabled = true;
}
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}
bool X86TargetLowering::useStackGuardXorFP() const {
// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
}
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL) const {
EVT PtrTy = getPointerTy(DAG.getDataLayout());
unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
return SDValue(Node, 0);
}
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
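// Without BWI a v32i1 mask doesn't fit in a single k-register, so prefer
// splitting it into two v16i1 halves over widening.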
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return TypeSplitVector;
if (VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
// v32i1 vectors should be promoted to v32i8 to match avx2.
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return MVT::v32i8;
// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return MVT::i8;
// Split v64i1 vectors if we don't have v64i8 available.
if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
CC != CallingConv::X86_RegCall)
return MVT::v32i1;
// FIXME: Should we just make these types legal and custom split operations?
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
return MVT::v16i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
// v32i1 vectors should be promoted to v32i8 to match avx2.
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return 1;
// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return VT.getVectorNumElements();
// Split v64i1 vectors if we don't have v64i8 available.
if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
CC != CallingConv::X86_RegCall)
return 2;
// FIXME: Should we just make these types legal and custom split operations?
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
return 1;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const {
// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
RegisterVT = MVT::i8;
IntermediateVT = MVT::i1;
NumIntermediates = VT.getVectorNumElements();
return NumIntermediates;
}
// Split v64i1 vectors if we don't have v64i8 available.
if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
CC != CallingConv::X86_RegCall) {
RegisterVT = MVT::v32i1;
IntermediateVT = MVT::v32i1;
NumIntermediates = 2;
return 2;
}
return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
NumIntermediates, RegisterVT);
}
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
return MVT::i8;
if (Subtarget.hasAVX512()) {
const unsigned NumElts = VT.getVectorNumElements();
// Figure out what this type will be legalized to.
EVT LegalVT = VT;
while (getTypeAction(Context, LegalVT) != TypeLegal)
LegalVT = getTypeToTransformTo(Context, LegalVT);
// If we got a 512-bit vector then we'll definitely have a vXi1 compare.
if (LegalVT.getSimpleVT().is512BitVector())
return EVT::getVectorVT(Context, MVT::i1, NumElts);
if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
// If we legalized to less than a 512-bit vector, then we will use a vXi1
// compare for vXi32/vXi64 for sure. If we have BWI we will also support
// vXi16/vXi8.
MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
return EVT::getVectorVT(Context, MVT::i1, NumElts);
}
}
return VT.changeVectorElementTypeToInteger();
}
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
if (MaxAlign == 16)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (VTy->getBitWidth() == 128)
MaxAlign = 16;
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
unsigned EltAlign = 0;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
unsigned EltAlign = 0;
getMaxByValAlign(EltTy, EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
if (MaxAlign == 16)
break;
}
}
}
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
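/// For example, a struct containing a __m128 field is placed on a 16-byte
/// boundary on 32-bit targets with SSE, while a struct of plain ints stays
/// at 4 bytes.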
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
unsigned TyAlign = DL.getABITypeAlignment(Ty);
if (TyAlign > 8)
return TyAlign;
return 8;
}
unsigned Align = 4;
if (Subtarget.hasSSE1())
getMaxByValAlign(Ty, Align);
return Align;
}
/// Returns the target-specific optimal type for load and store operations as
/// a result of memset, memcpy, and memmove lowering. If DstAlign is zero,
/// the destination alignment can satisfy any constraint. Similarly, if
/// SrcAlign is zero, there is no need to check it against an alignment
/// requirement, probably because the source does not need to be loaded. If
/// 'IsMemset' is true, we are expanding a memset. If 'ZeroMemset' is true,
/// it is a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
/// source is constant, so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const {
if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16)))) {
// FIXME: Check if unaligned 64-byte accesses are slow.
if (Size >= 64 && Subtarget.hasAVX512() &&
(Subtarget.getPreferVectorWidth() >= 512)) {
return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
}
// FIXME: Check if unaligned 32-byte accesses are slow.
if (Size >= 32 && Subtarget.hasAVX() &&
(Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
// choose an optimal type with a vector element larger than a byte,
// getMemsetStores() may create an intermediate splat (using an integer
// multiply) before we splat as a vector.
return MVT::v32i8;
}
if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v16i8;
// TODO: Can SSE1 handle a byte vector?
// If we have SSE1 registers we should be able to use them.
if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
(Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
} else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
!Subtarget.is64Bit() && Subtarget.hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
// Also, do not use f64 to lower memset unless this is a memset of zeros.
// The gymnastics of splatting a byte value into an XMM register and then
// only using 8-byte stores (because this is a CPU with slow unaligned
// 16-byte accesses) makes that a loser.
return MVT::f64;
}
}
// This is a compromise. If we reach here, unaligned accesses may be slow on
// this target. However, creating smaller, aligned accesses could be even
// slower and would certainly be a lot more code.
if (Subtarget.is64Bit() && Size >= 8)
return MVT::i64;
return MVT::i32;
}
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
if (VT == MVT::f32)
return X86ScalarSSEf32;
else if (VT == MVT::f64)
return X86ScalarSSEf64;
return true;
}
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Fast) {
switch (VT.getSizeInBits()) {
default:
// 8-byte and under are always assumed to be fast.
*Fast = true;
break;
case 128:
*Fast = !Subtarget.isUnalignedMem16Slow();
break;
case 256:
*Fast = !Subtarget.isUnalignedMem32Slow();
break;
// TODO: What about AVX-512 (512-bit) accesses?
}
}
// NonTemporal vector memory ops must be aligned.
if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
// NT loads can only be vector aligned, so if it's less aligned than the
// minimum vector size (which we can split the vector down to), we might as
// well use a regular unaligned vector load.
// We don't have any NT loads pre-SSE41.
if (!!(Flags & MachineMemOperand::MOLoad))
return (Align < 16 || !Subtarget.hasSSE41());
return false;
}
// Misaligned accesses of any size are always allowed.
return true;
}
/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
// In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// symbol.
if (isPositionIndependent() && Subtarget.isPICStyleGOT())
return MachineJumpTableInfo::EK_Custom32;
// Otherwise, use the normal jump table encoding heuristics.
return TargetLowering::getJumpTableEncoding();
}
bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
ArgListTy &Args) const {
// Only relabel X86-32 for C / Stdcall CCs.
if (Subtarget.is64Bit())
return;
if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
return;
unsigned ParamRegs = 0;
if (auto *M = MF->getFunction().getParent())
ParamRegs = M->getNumberRegisterParameters();
// Mark the first N integer arguments as being passed in registers.
for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
Type *T = Args[Idx].Ty;
if (T->isIntOrPtrTy())
if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
unsigned numRegs = 1;
if (MF->getDataLayout().getTypeAllocSize(T) > 4)
numRegs = 2;
if (ParamRegs < numRegs)
return;
ParamRegs -= numRegs;
Args[Idx].IsInReg = true;
}
}
}
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
unsigned uid, MCContext &Ctx) const {
assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
// entries.
return MCSymbolRefExpr::create(MBB->getSymbol(),
MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget.is64Bit())
// This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()));
return Table;
}
/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
MCContext &Ctx) const {
// X86-64 uses RIP relative addressing based on the jump table label.
if (Subtarget.isPICStyleRIPRel())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
// Otherwise, the reference is relative to the PIC base.
return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
case MVT::x86mmx:
RRC = &X86::VR64RegClass;
break;
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
case MVT::v8f32: case MVT::v4f64:
case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
case MVT::v16f32: case MVT::v8f64:
RRC = &X86::VR128XRegClass;
break;
}
return std::make_pair(RRC, Cost);
}
unsigned X86TargetLowering::getAddressSpace() const {
if (Subtarget.is64Bit())
return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
return 256;
}
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
(TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}
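// Build a constant pointer to the given offset within a segment-relative
// address space (on x86, address space 256 is %gs and 257 is %fs).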
static Constant* SegmentOffset(IRBuilder<> &IRB,
unsigned Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// glibc, bionic, and Fuchsia have a special slot for the stack guard in
// tcbhead_t; use it instead of the usual global variable (see
// sysdeps/{i386,x86_64}/nptl/tls.h)
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
if (Subtarget.isTargetFuchsia()) {
// <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
return SegmentOffset(IRB, 0x10, getAddressSpace());
} else {
// %fs:0x28, unless we're using a Kernel code model, in which case
// it's %gs:0x28. %gs:0x14 on i386.
unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
return SegmentOffset(IRB, Offset, getAddressSpace());
}
}
return TargetLowering::getIRStackGuard(IRB);
}
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
// MSVC CRT has a global variable holding security cookie.
M.getOrInsertGlobal("__security_cookie",
Type::getInt8PtrTy(M.getContext()));
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
"__security_check_cookie", Type::getVoidTy(M.getContext()),
Type::getInt8PtrTy(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->setCallingConv(CallingConv::X86_FastCall);
F->addAttribute(1, Attribute::AttrKind::InReg);
}
return;
}
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
// MSVC CRT has a global variable holding security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getGlobalVariable("__security_cookie");
}
return TargetLowering::getSDagStackGuard(M);
}
Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getFunction("__security_check_cookie");
}
return TargetLowering::getSSPStackGuardCheck(M);
}
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (Subtarget.getTargetTriple().isOSContiki())
return getDefaultSafeStackPointerLocation(IRB, false);
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
if (Subtarget.isTargetAndroid()) {
// %fs:0x48, unless we're using a Kernel code model, in which case it's
// %gs:0x48. %gs:0x24 on i386.
unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
return SegmentOffset(IRB, Offset, getAddressSpace());
}
// Fuchsia is similar.
if (Subtarget.isTargetFuchsia()) {
// <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
return SegmentOffset(IRB, 0x18, getAddressSpace());
}
return TargetLowering::getSafeStackPointerLocation(IRB);
}
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
const TargetMachine &TM = getTargetMachine();
if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS))
return false;
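// On x86 all address spaces below 256 alias the generic address space, so a
// cast between them is a no-op; segment-relative spaces (>= 256) are not.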
return SrcAS < 256 && DestAS < 256;
}
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
bool X86TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
return ScratchRegs;
}
/// Lowers mask values (v*i1) to the local register values.
/// \returns DAG node after lowering to register type
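/// For example, a v16i1 value returned in an i32 location is bitcast to i16
/// and then any-extended to i32.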
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
const SDLoc &Dl, SelectionDAG &DAG) {
EVT ValVT = ValArg.getValueType();
if (ValVT == MVT::v1i1)
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
DAG.getIntPtrConstant(0, Dl));
if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
(ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
// Two-stage lowering might be required:
// bitcast: v8i1 -> i8 / v16i1 -> i16
// anyextend: i8 -> i32 / i16 -> i32
EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
if (ValLoc == MVT::i32)
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
return ValToCopy;
}
if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
(ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
// One-stage lowering is required:
// bitcast: v32i1 -> i32 / v64i1 -> i64
return DAG.getBitcast(ValLoc, ValArg);
}
return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
/// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
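/// For example, a v64i1 argument is bitcast to i64 and split into two i32
/// halves that are passed in two 32-bit registers.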
static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The value should reside in two registers");
// Before splitting the value we cast it to i64
Arg = DAG.getBitcast(MVT::i64, Arg);
// Splitting the value into two i32 types
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(0, Dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(1, Dl, MVT::i32));
// Attach the two i32 values to the corresponding registers
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
// In some cases we need to disable registers from the default CSR list.
// For example, when they are used for argument passing.
bool ShouldDisableCalleeSavedRegister =
CallConv == CallingConv::X86_RegCall ||
MF.getFunction().hasFnAttribute("no_caller_saved_registers");
if (CallConv == CallingConv::X86_INTR && !Outs.empty())
report_fatal_error("X86 interrupts may not return any value");
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
SDValue Flag;
SmallVector<SDValue, 6> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
MVT::i32));
// Copy the result values into the output registers.
for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
// Add the register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
// Promote values to the appropriate types.
if (VA.getLocInfo() == CCValAssign::SExt)
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::ZExt)
ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::AExt) {
if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
else
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
}
else if (VA.getLocInfo() == CCValAssign::BCvt)
ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
// Report an error if we have attempted to return a value via an XMM
// register and SSE was disabled.
if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (!Subtarget.hasSSE2() &&
X86::FR64XRegClass.contains(VA.getLocReg()) &&
ValVT == MVT::f64) {
// When returning a double via an XMM register, report an error if SSE2 is
// not enabled.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
if (VA.getLocReg() == X86::FP0 ||
VA.getLocReg() == X86::FP1) {
// If this is a copy from an xmm register to ST(0), use an FPExtend to
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
RetOps.push_back(ValToCopy);
// Don't emit a copytoreg.
continue;
}
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
// which is returned in RAX / RDX.
if (Subtarget.is64Bit()) {
if (ValVT == MVT::x86mmx) {
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
if (!Subtarget.hasSSE2())
ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
}
}
}
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I],
Subtarget);
assert(2 == RegsToPass.size() &&
"Expecting two registers after Pass64BitArgInRegs");
// Add the second register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
// Add nodes to the DAG and add the values into the RetOps list
for (auto &Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
}
}
// The Swift calling convention does not require that we copy the sret
// argument into %rax/%eax for the return, and SRetReturnReg is not set for
// Swift.
// All other x86 ABIs require that, when returning a struct by value, we copy
// the sret argument into %rax/%eax (depending on ABI) for the return. We
// saved the argument into a virtual register in the entry block, so now we
// copy the value out and into %rax/%eax.
//
// Checking Function.hasStructRetAttr() here is insufficient because the IR
// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
// When we have both sret and another return value, we should use the
// original Chain stored in RetOps[0], instead of the current Chain updated
// in the above loop. If we only have sret, RetOps[0] equals to Chain.
// For the case of sret and another return value, we have
// Chain_0 at the function entry
// Chain_1 = getCopyToReg(Chain_0) in the above loop
// If we use Chain_1 in getCopyFromReg, we will have
// Val = getCopyFromReg(Chain_1)
// Chain_2 = getCopyToReg(Chain_1, Val) from below
// getCopyToReg(Chain_0) will be glued together with
// getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
// in Unit B, and we will have cyclic dependency between Unit A and Unit B:
// Data dependency from Unit B to Unit A due to usage of Val in
// getCopyToReg(Chain_1, Val)
// Chain dependency from Unit A to Unit B
// So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
unsigned RetValReg
= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
// Add the returned register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
}
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (X86::GR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
X86ISD::NodeType opcode = X86ISD::RET_FLAG;
if (CallConv == CallingConv::X86_INTR)
opcode = X86ISD::IRET;
return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
return false;
SDValue TCChain = Chain;
SDNode *Copy = *N->use_begin();
if (Copy->getOpcode() == ISD::CopyToReg) {
// If the copy has a glue operand, we conservatively assume it isn't safe to
// perform a tail call.
if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
return false;
TCChain = Copy->getOperand(0);
} else if (Copy->getOpcode() != ISD::FP_EXTEND)
return false;
bool HasRet = false;
for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
UI != UE; ++UI) {
if (UI->getOpcode() != X86ISD::RET_FLAG)
return false;
// If we are returning more than one value, we can definitely
// not make a tail call; see PR19530.
if (UI->getNumOperands() > 4)
return false;
if (UI->getNumOperands() == 4 &&
UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
return false;
HasRet = true;
}
if (!HasRet)
return false;
Chain = TCChain;
return true;
}
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
ISD::NodeType ExtendKind) const {
MVT ReturnMVT = MVT::i32;
bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
// The ABI does not require i1, i8 or i16 to be extended.
//
// On Darwin, there is code in the wild relying on Clang's old behaviour of
// always extending i8/i16 return values, so keep doing that for now.
// (PR26665).
ReturnMVT = MVT::i8;
}
EVT MinVT = getRegisterType(Context, ReturnMVT);
return VT.bitsLT(MinVT) ? MinVT : VT;
}
/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents an SDValue in the parent DAG node for
/// glue purposes. In case the DAG is already using a
/// physical register instead of a virtual one, we should glue
/// our new SDValue to the InFlag SDValue.
/// \return a new SDValue of size 64 bits.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
const SDLoc &Dl, const X86Subtarget &Subtarget,
SDValue *InFlag = nullptr) {
assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(VA.getValVT() == MVT::v64i1 &&
"Expecting first location of 64 bit width type");
assert(NextVA.getValVT() == VA.getValVT() &&
"The locations should have the same type");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The values should reside in two registers");
SDValue Lo, Hi;
SDValue ArgValueLo, ArgValueHi;
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterClass *RC = &X86::GR32RegClass;
// Read a 32 bit value from the registers.
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
} else {
// When a physical register is available read the value from it and glue
// the reads together.
ArgValueLo =
DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
*InFlag = ArgValueLo.getValue(2);
ArgValueHi =
DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
*InFlag = ArgValueHi.getValue(2);
}
// Convert the i32 type into v32i1 type.
Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
// Convert the i32 type into v32i1 type.
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
// Concatenate the two values together.
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to mask type.
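/// For example, a v16i1 value arriving in an i32 location is truncated to
/// i16 and then bitcast to v16i1.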
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
const EVT &ValLoc, const SDLoc &Dl,
SelectionDAG &DAG) {
SDValue ValReturned = ValArg;
if (ValVT == MVT::v1i1)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
if (ValVT == MVT::v64i1) {
// On 32-bit targets this case is handled by getv64i1Argument.
assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
// On 64-bit targets there is no need to truncate the value; a bitcast suffices.
} else {
MVT maskLen;
switch (ValVT.getSimpleVT().SimpleTy) {
case MVT::v8i1:
maskLen = MVT::i8;
break;
case MVT::v16i1:
maskLen = MVT::i16;
break;
case MVT::v32i1:
maskLen = MVT::i32;
break;
default:
llvm_unreachable("Expecting a vector of i1 types");
}
ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
}
return DAG.getBitcast(ValVT, ValReturned);
}
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
uint32_t *RegMask) const {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
++I, ++InsIndex) {
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
// In some calling conventions we need to remove the used registers
// from the register mask.
if (RegMask) {
for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
}
// Report an error if there was an attempt to return FP values via XMM
// registers.
if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
if (VA.getLocReg() == X86::XMM1)
VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
else
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (!Subtarget.hasSSE2() &&
X86::FR64XRegClass.contains(VA.getLocReg()) &&
CopyVT == MVT::f64) {
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
if (VA.getLocReg() == X86::XMM1)
VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
else
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
if (!Subtarget.hasX87())
report_fatal_error("X87 register return with X87 disabled");
CopyVT = MVT::f80;
RoundAfterCopy = (CopyVT != VA.getLocVT());
}
SDValue Val;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
Val =
getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
} else {
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
.getValue(1);
Val = Chain.getValue(0);
InFlag = Chain.getValue(2);
}
if (RoundAfterCopy)
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
// This truncation won't change the value.
DAG.getIntPtrConstant(1, dl));
if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
if (VA.getValVT().isVector() &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
} else
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}
if (VA.getLocInfo() == CCValAssign::BCvt)
Val = DAG.getBitcast(VA.getValVT(), Val);
InVals.push_back(Val);
}
return Chain;
}
//===----------------------------------------------------------------------===//
// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// The StdCall calling convention is standard for many Windows API routines.
// It differs from the C calling convention just a little: the callee, not the
// caller, should clean up the stack. Symbols should also be decorated in some
// fancy way :) It doesn't support any vector arguments.
// For info on the fast calling convention see the Fast Calling Convention
// (tail call) implementation, LowerX86_32FastCCCallTo.
/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
NotStructReturn,
RegStructReturn,
StackStructReturn
};
static StructReturnType
callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
if (Outs.empty())
return NotStructReturn;
const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
}
/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
if (Ins.empty())
return NotStructReturn;
const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
}
/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
/*isVolatile*/false, /*AlwaysInline=*/true,
/*isTailCall*/false,
MachinePointerInfo(), MachinePointerInfo());
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
CC == CallingConv::HHVM || CC == CallingConv::Tail);
}
/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
// C calling conventions:
case CallingConv::C:
case CallingConv::Win64:
case CallingConv::X86_64_SysV:
// Callee pop conventions:
case CallingConv::X86_ThisCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_VectorCall:
case CallingConv::X86_FastCall:
// Swift:
case CallingConv::Swift:
return true;
default:
return canGuaranteeTCO(CC);
}
}
/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!CI->isTailCall())
return false;
ImmutableCallSite CS(CI);
CallingConv::ID CalleeCC = CS.getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
return true;
}
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo &MFI, unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
bool AlwaysUseMutable = shouldGuaranteeTCO(
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
MVT PtrVT = getPointerTy(DAG.getDataLayout());
// If the value is passed by pointer, we receive its address instead of the
// value itself. No extension is needed if the mask value and its location
// share the same absolute size.
bool ExtendedInMem =
VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
ValVT = VA.getLocVT();
else
ValVT = VA.getValVT();
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In the case of tail call optimization, mark all arguments mutable, since
// they could be overwritten by the lowering of the tail call's arguments.
if (Flags.isByVal()) {
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
// FIXME: For now, all byval parameter objects are marked as aliasing. This
// can be improved with deeper analysis.
int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
/*isAliased=*/true);
return DAG.getFrameIndex(FI, PtrVT);
}
// This is an argument in memory. We might be able to perform copy elision:
// if the argument is passed directly in memory without any extension, the
// copy can be elided. Large vector types, for example, may instead be
// passed indirectly by pointer and do not qualify.
if (Flags.isCopyElisionCandidate() &&
VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
EVT ArgVT = Ins[i].ArgVT;
SDValue PartAddr;
if (Ins[i].PartOffset == 0) {
// If this is a one-part value or the first part of a multi-part value,
// create a stack object for the entire argument value type and return a
// load from our portion of it. This assumes that if the first part of an
// argument is in memory, the rest will also be in memory.
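// For instance, an i64 split into two i32 parts on a 32-bit target creates
// a single 8-byte fixed object here for part 0; the piece with
// PartOffset == 4 is then loaded out of that same object by the branch
// below.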
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
/*IsImmutable=*/false);
PartAddr = DAG.getFrameIndex(FI, PtrVT);
return DAG.getLoad(
ValVT, dl, Chain, PartAddr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
} else {
// This is not the first piece of an argument in memory. See if there is
// already a fixed stack object including this offset. If so, assume it
// was created by the PartOffset == 0 branch above and create a load from
// the appropriate offset into it.
int64_t PartBegin = VA.getLocMemOffset();
int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
int FI = MFI.getObjectIndexBegin();
for (; MFI.isFixedObjectIndex(FI); ++FI) {
int64_t ObjBegin = MFI.getObjectOffset(FI);
int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
break;
}
if (MFI.isFixedObjectIndex(FI)) {
SDValue Addr =
DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
return DAG.getLoad(
ValVT, dl, Chain, Addr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
Ins[i].PartOffset));
}
}
}
int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
VA.getLocMemOffset(), isImmutable);
// Set SExt or ZExt flag.
if (VA.getLocInfo() == CCValAssign::ZExt) {
MFI.setObjectZExt(FI, true);
} else if (VA.getLocInfo() == CCValAssign::SExt) {
MFI.setObjectSExt(FI, true);
}
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
return ExtendedInMem
? (VA.getValVT().isVector()
? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
: DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
: Val;
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
const X86Subtarget &Subtarget) {
assert(Subtarget.is64Bit());
if (Subtarget.isCallingConvWin64(CallConv)) {
static const MCPhysReg GPR64ArgRegsWin64[] = {
X86::RCX, X86::RDX, X86::R8, X86::R9
};
return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
}
static const MCPhysReg GPR64ArgRegs64Bit[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
};
return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
CallingConv::ID CallConv,
const X86Subtarget &Subtarget) {
assert(Subtarget.is64Bit());
if (Subtarget.isCallingConvWin64(CallConv)) {
// The XMM registers which might contain var arg parameters are shadowed
// by their paired GPRs. So we only need to save the GPRs to their home
// slots.
// TODO: __vectorcall will change this.
return None;
}
const Function &F = MF.getFunction();
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool isSoftFloat = Subtarget.useSoftFloat();
assert(!(isSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
static const MCPhysReg XMMArgRegs64Bit[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
#ifndef NDEBUG
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
[](const CCValAssign &A, const CCValAssign &B) -> bool {
return A.getValNo() < B.getValNo();
});
}
#endif
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Function &F = MF.getFunction();
if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
F.getName() == "main")
FuncInfo->setForceFramePointer(true);
MachineFrameInfo &MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(
!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeArguments(Ins, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
}
// The next loop assumes that the locations are in the same order as the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
SDValue ArgValue;
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++InsIndex) {
assert(InsIndex < Ins.size() && "Invalid Ins index");
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
if (VA.needsCustom()) {
assert(
VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// In the regcall calling convention, v64i1 values compiled for a 32-bit
// arch are split up into two registers.
ArgValue =
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
const TargetRegisterClass *RC;
if (RegVT == MVT::i8)
RC = &X86::GR8RegClass;
else if (RegVT == MVT::i16)
RC = &X86::GR16RegClass;
else if (RegVT == MVT::i32)
RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
else if (RegVT == MVT::f32)
RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
else if (RegVT == MVT::f64)
RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
else if (RegVT == MVT::f80)
RC = &X86::RFP80RegClass;
else if (RegVT == MVT::f128)
RC = &X86::VR128RegClass;
else if (RegVT.is512BitVector())
RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
else if (RegVT.is128BitVector())
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
else if (RegVT == MVT::v1i1)
RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
else if (RegVT == MVT::v16i1)
RC = &X86::VK16RegClass;
else if (RegVT == MVT::v32i1)
RC = &X86::VK32RegClass;
else if (RegVT == MVT::v64i1)
RC = &X86::VK64RegClass;
else
llvm_unreachable("Unknown argument type!");
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
if (VA.getLocInfo() == CCValAssign::SExt)
ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::ZExt)
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::BCvt)
ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
if (VA.isExtInLoc()) {
// Handle MMX values passed in XMM regs.
if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
else if (VA.getValVT().isVector() &&
VA.getValVT().getScalarType() == MVT::i1 &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
} else
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
}
} else {
assert(VA.isMemLoc());
ArgValue =
LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
}
// If value is passed via pointer - do a load.
if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
ArgValue =
DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
InVals.push_back(ArgValue);
}
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
// The Swift calling convention does not require that we copy the sret
// argument into %rax/%eax for the return, so don't set SRetReturnReg.
if (CallConv == CallingConv::Swift)
continue;
// All x86 ABIs require that for returning structs by value we copy the
// sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
// return points.
if (Ins[I].Flags.isSRet()) {
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
}
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
break;
}
}
unsigned StackSize = CCInfo.getNextStackOffset();
// Align stack specially for tail calls.
if (shouldGuaranteeTCO(CallConv,
MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes a variable number of arguments, make a frame index
// for the start of the first vararg value, for expansion of llvm.va_start.
// We can skip this if there are no va_start calls.
if (MFI.hasVAStart() &&
(Is64Bit || (CallConv != CallingConv::X86_FastCall &&
CallConv != CallingConv::X86_ThisCall))) {
FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
}
// Figure out if XMM registers are in use.
assert(!(Subtarget.useSoftFloat() &&
F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
// 64-bit calling conventions support varargs and register parameters, so we
// have to do extra work to spill them in the prologue.
if (Is64Bit && isVarArg && MFI.hasVAStart()) {
// Find the index of the first unallocated register in each argument
// register class.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
// Gather all the live in physical registers.
SmallVector<SDValue, 6> LiveGPRs;
SmallVector<SDValue, 8> LiveXMMRegs;
SDValue ALVal;
for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
LiveGPRs.push_back(
DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
}
if (!ArgXMMs.empty()) {
unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
LiveXMMRegs.push_back(
DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
}
}
if (IsWin64) {
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
FuncInfo->setRegSaveFrameIndex(
MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
// Fix up the vararg frame index to point into the shadow area (4 x i64).
if (NumIntRegs < 4)
FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
} else {
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so
// they may be loaded by dereferencing the result of va_next.
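// For illustration (sizes per the SysV x86-64 ABI): the register save area
// created below is ArgGPRs.size() * 8 + ArgXMMs.size() * 16 bytes, i.e.
// 6 * 8 + 8 * 16 = 176 bytes when SSE is available, regardless of how many
// registers already hold fixed arguments.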
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
}
// Store the integer parameter registers.
SmallVector<SDValue, 8> MemOps;
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
getPointerTy(DAG.getDataLayout()));
unsigned Offset = FuncInfo->getVarArgsGPOffset();
for (SDValue Val : LiveGPRs) {
SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
RSFIN, DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(),
FuncInfo->getRegSaveFrameIndex(), Offset));
MemOps.push_back(Store);
Offset += 8;
}
if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
// Now store the XMM (fp + vector) parameter registers.
SmallVector<SDValue, 12> SaveXMMOps;
SaveXMMOps.push_back(Chain);
SaveXMMOps.push_back(ALVal);
SaveXMMOps.push_back(DAG.getIntPtrConstant(
FuncInfo->getRegSaveFrameIndex(), dl));
SaveXMMOps.push_back(DAG.getIntPtrConstant(
FuncInfo->getVarArgsFPOffset(), dl));
SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
LiveXMMRegs.end());
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
MVT::Other, SaveXMMOps));
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
}
if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
if (Subtarget.useAVX512Regs() &&
(Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
CallConv == CallingConv::Intel_OCL_BI)))
VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
VecVT = MVT::v8f32;
else if (Subtarget.hasSSE2())
VecVT = MVT::v4f32;
// We forward some GPRs and some vector types.
SmallVector<MVT, 2> RegParmTypes;
MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
RegParmTypes.push_back(IntVT);
if (VecVT != MVT::Other)
RegParmTypes.push_back(VecVT);
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
// Forward AL for SysV x86_64 targets, since it is used for varargs.
if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
}
// Copy all forwards from physical to virtual registers.
for (ForwardedRegister &FR : Forwards) {
// FIXME: Can we use a less constrained schedule?
SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
}
}
// Some CCs need callee pop.
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
// X86 interrupts must pop the error code (and the alignment padding) if
// present.
FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
if (!Is64Bit) {
// RegSaveFrameIndex is X86-64 only.
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
if (CallConv == CallingConv::X86_FastCall ||
CallConv == CallingConv::X86_ThisCall)
// fastcc functions can't have varargs.
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
}
FuncInfo->setArgumentStackSize(StackSize);
if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
if (Personality == EHPersonality::CoreCLR) {
assert(Is64Bit);
// TODO: Add a mechanism to frame lowering that will allow us to indicate
// that we'd prefer this slot be allocated towards the bottom of the frame
// (i.e. near the stack pointer after allocating the frame). Every
// funclet needs a copy of this slot in its (mostly empty) frame, and the
// offset from the bottom of this and each funclet's frame must be the
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
if (CallConv == CallingConv::X86_RegCall ||
F.hasFnAttribute("no_caller_saved_registers")) {
MachineRegisterInfo &MRI = MF.getRegInfo();
for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
MRI.disableCalleeSavedRegister(Pair.first);
}
return Chain;
}
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
SDValue Arg, const SDLoc &dl,
SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, PtrOff);
if (Flags.isByVal())
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
return DAG.getStore(
Chain, dl, Arg, PtrOff,
MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
/// Emit a load of the return address if tail call
/// optimization is performed and it is required.
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
bool Is64Bit, int FPDiff, const SDLoc &dl) const {
// Adjust the Return address stack slot.
EVT VT = getPointerTy(DAG.getDataLayout());
OutRetAddr = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
return SDValue(OutRetAddr.getNode(), 1);
}
/// Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx,
EVT PtrVT, unsigned SlotSize,
int FPDiff, const SDLoc &dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
int NewReturnAddrFI =
MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
false);
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), NewReturnAddrFI));
return Chain;
}
/// Returns a vector_shuffle mask for a movs{s|d} or movd
/// operation of the specified width.
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
Mask.push_back(NumElems);
for (unsigned i = 1; i != NumElems; ++i)
Mask.push_back(i);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
CallingConv::ID CallConv = CLI.CallConv;
bool &isTailCall = CLI.IsTailCall;
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
MachineFunction::CallSiteInfo CSInfo;
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
// that requires lazy function symbol resolution. Using musttail or
// GuaranteedTailCallOpt will override this.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G || (!G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility()))
isTailCall = false;
}
bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
// around.
isTailCall = true;
} else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
MF.getFunction().hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!IsGuaranteeTCO && isTailCall)
IsSibcall = true;
if (isTailCall)
++NumTailCalls;
}
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeArguments(Outs, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
// This is a sibcall. The memory operands are already available in the
// caller's incoming argument area (its own caller's stack).
NumBytes = 0;
else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
FPDiff = NumBytesCallerPushed - NumBytes;
// Record the delta by which the return address stack slot moves; only
// update it when this call moves the slot further (a smaller, i.e. more
// negative, FPDiff) than previously recorded.
if (FPDiff < X86Info->getTCReturnAddrDelta())
X86Info->setTCReturnAddrDelta(FPDiff);
}
unsigned NumBytesToPush = NumBytes;
unsigned NumBytesToPop = NumBytes;
// If we have an inalloca argument, all stack space has already been
// allocated for us and will be right at the top of the stack. We don't
// support multiple arguments passed in memory when using inalloca.
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
NumBytesToPush = 0;
if (!ArgLocs.back().isMemLoc())
report_fatal_error("cannot use inalloca attribute on a register "
"parameter");
if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be "
"the only memory argument");
}
if (!IsSibcall && !IsMustTail)
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
if (isTailCall && FPDiff)
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
// The next loop assumes that the locations are in the same order as the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization, arguments are handled later.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutIndex) {
assert(OutIndex < Outs.size() && "Invalid Out index");
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
if (Flags.isInAlloca())
continue;
CCValAssign &VA = ArgLocs[I];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[OutIndex];
bool isByVal = Flags.isByVal();
// Promote the value if needed.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::AExt:
if (Arg.getValueType().isVector() &&
Arg.getValueType().getVectorElementType() == MVT::i1)
Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
else if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
Arg = DAG.getBitcast(MVT::i64, Arg);
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
} else
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getBitcast(RegVT, Arg);
break;
case CCValAssign::Indirect: {
if (isByVal) {
// Memcpy the argument to a temporary stack slot to prevent
// the caller from seeing any modifications the callee may make
// as guaranteed by the `byval` attribute.
int FrameIdx = MF.getFrameInfo().CreateStackObject(
Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
false);
SDValue StackSlot =
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
Chain =
CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
// From now on treat this as a regular pointer
Arg = StackSlot;
isByVal = false;
} else {
// Store the argument.
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
Chain = DAG.getStore(
Chain, dl, Arg, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
}
break;
}
}
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// Split v64i1 value into two registers
Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.EnableDebugEntryValues)
CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// The Win64 ABI requires an XMM register argument to be copied to its
// corresponding shadow GPR if the callee is a varargs function.
unsigned ShadowReg = 0;
switch (VA.getLocReg()) {
case X86::XMM0: ShadowReg = X86::RCX; break;
case X86::XMM1: ShadowReg = X86::RDX; break;
case X86::XMM2: ShadowReg = X86::R8; break;
case X86::XMM3: ShadowReg = X86::R9; break;
}
if (ShadowReg)
RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
}
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires the GOT pointer to be in the EBX register before
// function calls made via the PLT.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(
unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
// the tail jump. This is done to circumvent the ebx/callee-saved problem
// for tail calls on PIC/GOT architectures. Normally we would just put the
// address of GOT into ebx and then call target@PLT. But for tail calls
// ebx would be restored (since ebx is callee saved) before jumping to the
// target@PLT.
// Note: The actual moving to ECX is done further down.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (G && !G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility())
Callee = LowerGlobalAddress(Callee, DAG);
else if (isa<ExternalSymbolSDNode>(Callee))
Callee = LowerExternalSymbol(Callee, DAG);
}
}
if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used and is in the range 0 - 8 inclusive.
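// For example, a call like printf("%f %f\n", x, y) passes the two doubles
// in XMM0/XMM1, so this lowering sets %al = 2.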
// Count the number of XMM registers allocated.
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
DAG.getConstant(NumXMMRegs, dl,
MVT::i8)));
}
if (isVarArg && IsMustTail) {
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
}
}
// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
// don't need this because the eligibility check rejects calls that require
// shuffling arguments passed in memory.
if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
// the alias isn't otherwise explicit. This is slightly more conservative
// than necessary, because it means that each store effectively depends
// on every argument instead of just those arguments it would clobber.
SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
"Expecting custom case only in regcall calling convention");
// This means that we are in a special case where one argument was
// passed through two register locations; skip the next location.
++I;
}
continue;
}
assert(VA.isMemLoc());
SDValue Arg = OutVals[OutsIndex];
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
// Skip inalloca arguments. They don't require any work.
if (Flags.isInAlloca())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
MemOpChains2.push_back(DAG.getStore(
ArgChain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
getPointerTy(DAG.getDataLayout()),
RegInfo->getSlotSize(), FPDiff, dl);
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into registers.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
} else if (Callee->getOpcode() == ISD::GlobalAddress ||
Callee->getOpcode() == ISD::ExternalSymbol) {
// Lower direct calls to global addresses and external symbols. Setting
// ForCall to true here has the effect of removing WrapperRIP when possible
// to allow direct calls to be selected without first materializing the
// address into a register.
Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit one per the x32 ABI.
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 8> Ops;
if (!IsSibcall && isTailCall && !IsMustTail) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
Ops.push_back(Chain);
Ops.push_back(Callee);
if (isTailCall)
Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
// If HasNCSR is set (the NoCallerSavedRegisters attribute exists), we use
// the X86_INTR calling convention because it has the same CSR mask
// (the same preserved registers).
const uint32_t *Mask = RegInfo->getCallPreservedMask(
MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// If this is an invoke in a 32-bit function using a funclet-based
// personality, assume the function clobbers all registers. If an exception
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
const Function &CallerFn = MF.getFunction();
EHPersonality Pers =
CallerFn.hasPersonalityFn()
? classifyEHPersonality(CallerFn.getPersonalityFn())
: EHPersonality::Unknown;
if (isFuncletEHPersonality(Pers))
Mask = RegInfo->getNoPreservedMask();
}
// Define a new register mask from the existing mask.
uint32_t *RegMask = nullptr;
// In some calling conventions we need to remove the used physical registers
// from the reg mask.
if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Allocate a new Reg Mask and copy Mask.
RegMask = MF.allocateRegMask();
unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
for (auto const &RegPair : RegsToPass)
for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
// Create the RegMask Operand according to our updated mask.
Ops.push_back(DAG.getRegisterMask(RegMask));
} else {
// Create the RegMask Operand according to the static mask.
Ops.push_back(DAG.getRegisterMask(Mask));
}
if (InFlag.getNode())
Ops.push_back(InFlag);
if (isTailCall) {
// We used to do:
//// If this is the first return lowered for this function, add the regs
//// to the liveout set for the function.
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
return Ret;
}
if (HasNoCfCheck && IsCFProtectionSupported) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
} else {
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Save heapallocsite metadata.
if (CLI.CS)
if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
// For MSVC Win32 targets, the caller pops the hidden struct pointer.
NumBytesForCalleeToPop = 4;
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
// No need to reset the stack after the call if the call doesn't return. To
// keep the MI verifier happy, we'll pretend the callee does it for us.
NumBytesForCalleeToPop = NumBytes;
}
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
true),
InFlag, dl);
InFlag = Chain.getValue(1);
}
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
InVals, RegMask);
}
//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//
// Like StdCall, the callee cleans up the arguments, except that ECX is
// reserved for storing the tail-called function's address. Only 2 registers
// are free for argument passing (inreg). Tail call optimization is performed
// provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
// On the X86_64 architecture, with GOT-style position independent code,
// only local (within-module) calls are supported at the moment.
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - darwin's
// dyld for example)
// If a tail-called callee has more arguments than the caller, the caller
// needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after
// the original RETADDR, but before the saved frame pointer or the spilled
// registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
// Stack layout:
// arg1
// arg2
// RETADDR
// [ new RETADDR
// move area ]
// (possible EBP)
// ESI
// EDI
// local1 ..
/// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
/// requirement, so that the stack stays properly aligned once the
/// slot-sized return address is accounted for.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
SelectionDAG &DAG) const {
const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());
const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
assert(StackSize % SlotSize == 0 &&
"StackSize must be a multiple of SlotSize");
return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
}
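// Worked example (assuming a 32-bit target: SlotSize = 4, StackAlignment =
// 16): StackSize = 20 gives alignTo(24, 16) - 4 = 28, i.e. 16n + 12, so the
// stack is 16-byte aligned again once the 4-byte return address is pushed.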
/// Return true if the given stack call argument is already available at the
/// same relative position in the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
const X86InstrInfo *TII, const CCValAssign &VA) {
unsigned Bytes = Arg.getValueSizeInBits() / 8;
for (;;) {
// Look through nodes that don't alter the bits of the incoming value.
unsigned Op = Arg.getOpcode();
if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
Arg = Arg.getOperand(0);
continue;
}
if (Op == ISD::TRUNCATE) {
const SDValue &TruncInput = Arg.getOperand(0);
if (TruncInput.getOpcode() == ISD::AssertZext &&
cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
Arg.getValueType()) {
Arg = TruncInput.getOperand(0);
continue;
}
}
break;
}
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
return false;
if (!Flags.isByVal()) {
if (!TII->isLoadFromStackSlot(*Def, FI))
return false;
} else {
unsigned Opcode = Def->getOpcode();
if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
Opcode == X86::LEA64_32r) &&
Def->getOperand(1).isFI()) {
FI = Def->getOperand(1).getIndex();
Bytes = Flags.getByValSize();
} else
return false;
}
} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
if (Flags.isByVal())
// ByVal argument is passed in as a pointer but it's now being
// dereferenced. e.g.
// define @foo(%struct.X* %A) {
// tail call @bar(%struct.X* byval %A)
// }
return false;
SDValue Ptr = Ld->getBasePtr();
FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
if (!FINode)
return false;
FI = FINode->getIndex();
} else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
FI = FINode->getIndex();
Bytes = Flags.getByValSize();
} else
return false;
assert(FI != INT_MAX);
if (!MFI.isFixedObjectIndex(FI))
return false;
if (Offset != MFI.getObjectOffset(FI))
return false;
// If this is not byval, check that the argument stack object is immutable.
// inalloca and argument copy elision can create mutable argument stack
// objects. Byval objects can be mutated, but a byval call intends to pass the
// mutated memory.
if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
return false;
if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
// If the argument location is wider than the argument type, check that any
// extension flags match.
if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
Flags.isSExt() != MFI.isObjectSExt(FI)) {
return false;
}
}
return Bytes == MFI.getObjectSize(FI);
}
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
if (!mayTailCallThisCC(CalleeCC))
return false;
// If -tailcallopt is specified, make fastcc functions tail-callable.
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
// then the FP_EXTEND of the call result is not a nop. It's not safe to
// perform a tailcall optimization here.
if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
return false;
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
CalleeCC == CallingConv::Tail;
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
// space.
if (IsCalleeWin64 != IsCallerWin64)
return false;
if (IsGuaranteeTCO) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
}
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
if (IsCalleeWin64 || IsCallerWin64)
return false;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
if (!ArgLocs[i].isRegLoc())
return false;
}
// If the call result is in ST0 / ST1, it needs to be popped off the x87
// stack. Therefore, if it's not used by the call it is not safe to optimize
// this into a sibcall.
bool Unused = false;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (!Ins[i].Used) {
Unused = true;
break;
}
}
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
}
}
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
RetCC_X86, RetCC_X86))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
unsigned StackArgsSize = 0;
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
// Allocate shadow area for Win64
if (IsCalleeWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
StackArgsSize = CCInfo.getNextStackOffset();
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
if (!VA.isRegLoc()) {
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
MFI, MRI, TII, VA))
return false;
}
}
}
bool PositionIndependent = isPositionIndependent();
// If the tailcall address may be in a register, then make sure it's
// possible to register allocate for it. In 32-bit, the call address can
// only target EAX, EDX, or ECX since the tail call must be scheduled after
// callee-saved registers are restored. These happen to be the same
// registers used to pass 'inreg' arguments so watch out for those.
if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) ||
PositionIndependent)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs = PositionIndependent ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
continue;
Register Reg = VA.getLocReg();
switch (Reg) {
default: break;
case X86::EAX: case X86::EDX: case X86::ECX:
if (++NumInRegs == MaxInRegs)
return false;
break;
}
}
}
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
}
bool CalleeWillPop =
X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt);
if (unsigned BytesToPop =
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
// If we have bytes to pop, the callee must pop them.
bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
if (!CalleePopMatches)
return false;
} else if (CalleeWillPop && StackArgsSize > 0) {
// If we don't have bytes to pop, make sure the callee doesn't pop any.
return false;
}
return true;
}
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
return X86::createFastISel(funcInfo, libInfo);
}
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
static bool MayFoldLoad(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}
static bool MayFoldIntoStore(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}
static bool MayFoldIntoZeroExtend(SDValue Op) {
if (Op.hasOneUse()) {
unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
return (ISD::ZERO_EXTEND == Opcode);
}
return false;
}
static bool isTargetShuffle(unsigned Opcode) {
switch(Opcode) {
default: return false;
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::SHUFP:
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
case X86ISD::MOVHLPS:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VBROADCAST:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VZEXT_MOVL:
return true;
}
}
static bool isTargetShuffleVariableMask(unsigned Opcode) {
switch (Opcode) {
default: return false;
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::VPERMILPV:
case X86ISD::VPERMIL2:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
return true;
// 'Faux' Target Shuffles.
case ISD::OR:
case ISD::AND:
case X86ISD::ANDNP:
return true;
}
}
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
-(int64_t)SlotSize,
false);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
bool hasSymbolicDisplacement) {
// Offset should fit into 32 bit immediate field.
if (!isInt<32>(Offset))
return false;
// If we don't have a symbolic displacement - we don't have any extra
// restrictions.
if (!hasSymbolicDisplacement)
return true;
// FIXME: Some tweaks might be needed for medium code model.
if (M != CodeModel::Small && M != CodeModel::Kernel)
return false;
// For the small code model we assume that the highest object ends at least
// 16MB below the 31-bit boundary. We may also accept fairly large negative
// constants, knowing that all objects are in the positive half of the
// address space.
if (M == CodeModel::Small && Offset < 16*1024*1024)
return true;
// For the kernel code model we know that all objects reside in the negative
// half of the 32-bit address space. We must not accept negative offsets,
// since they may fall just out of range, but we may accept fairly large
// positive ones.
if (M == CodeModel::Kernel && Offset >= 0)
return true;
return false;
}
/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
// If GuaranteeTCO is true, we force some calls to be callee pop so that we
// can guarantee TCO.
if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
return true;
switch (CallingConv) {
default:
return false;
case CallingConv::X86_StdCall:
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::X86_VectorCall:
return !is64Bit;
}
}
/// Return true if the condition is a signed comparison operation.
static bool isX86CCSigned(unsigned X86CC) {
switch (X86CC) {
default:
llvm_unreachable("Invalid integer condition!");
case X86::COND_E:
case X86::COND_NE:
case X86::COND_B:
case X86::COND_A:
case X86::COND_BE:
case X86::COND_AE:
return false;
case X86::COND_G:
case X86::COND_GE:
case X86::COND_L:
case X86::COND_LE:
return true;
}
}
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
switch (SetCCOpcode) {
default: llvm_unreachable("Invalid integer condition!");
case ISD::SETEQ: return X86::COND_E;
case ISD::SETGT: return X86::COND_G;
case ISD::SETGE: return X86::COND_GE;
case ISD::SETLT: return X86::COND_L;
case ISD::SETLE: return X86::COND_LE;
case ISD::SETNE: return X86::COND_NE;
case ISD::SETULT: return X86::COND_B;
case ISD::SETUGT: return X86::COND_A;
case ISD::SETULE: return X86::COND_BE;
case ISD::SETUGE: return X86::COND_AE;
}
}
/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
bool isFP, SDValue &LHS, SDValue &RHS,
SelectionDAG &DAG) {
if (!isFP) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_NS;
}
if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
}
if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
// X >= 0 -> X == 0, jump on !sign.
return X86::COND_NS;
}
if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
}
}
return TranslateIntegerX86CC(SetCCOpcode);
}
// First determine if it is required or is profitable to flip the operands.
// If LHS is a foldable load, but RHS is not, flip the condition.
if (ISD::isNON_EXTLoad(LHS.getNode()) &&
!ISD::isNON_EXTLoad(RHS.getNode())) {
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
std::swap(LHS, RHS);
}
switch (SetCCOpcode) {
default: break;
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETUGT:
case ISD::SETUGE:
std::swap(LHS, RHS);
break;
}
// On a floating point condition, the flags are set as follows:
// ZF PF CF op
// 0 | 0 | 0 | X > Y
// 0 | 0 | 1 | X < Y
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
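// Note that an unordered compare sets ZF, PF and CF, so COND_B (CF==1) also
// matches SETULT (true on unordered), while COND_A and COND_AE require CF==0
// and therefore only match ordered results.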
switch (SetCCOpcode) {
default: llvm_unreachable("Condcode should be pre-legalized away");
case ISD::SETUEQ:
case ISD::SETEQ: return X86::COND_E;
case ISD::SETOLT: // flipped
case ISD::SETOGT:
case ISD::SETGT: return X86::COND_A;
case ISD::SETOLE: // flipped
case ISD::SETOGE:
case ISD::SETGE: return X86::COND_AE;
case ISD::SETUGT: // flipped
case ISD::SETULT:
case ISD::SETLT: return X86::COND_B;
case ISD::SETUGE: // flipped
case ISD::SETULE:
case ISD::SETLE: return X86::COND_BE;
case ISD::SETONE:
case ISD::SETNE: return X86::COND_NE;
case ISD::SETUO: return X86::COND_P;
case ISD::SETO: return X86::COND_NP;
case ISD::SETOEQ:
case ISD::SETUNE: return X86::COND_INVALID;
}
}
/// Is there a floating point cmov for the specific X86 condition code?
/// Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
switch (X86CC) {
default:
return false;
case X86::COND_B:
case X86::COND_BE:
case X86::COND_E:
case X86::COND_P:
case X86::COND_A:
case X86::COND_AE:
case X86::COND_NE:
case X86::COND_NP:
return true;
}
}
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
if (!IntrData)
return false;
Info.flags = MachineMemOperand::MONone;
Info.offset = 0;
switch (IntrData->Type) {
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
Info.opc = ISD::INTRINSIC_VOID;
Info.ptrVal = I.getArgOperand(0);
MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
ScalarVT = MVT::i8;
else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
ScalarVT = MVT::i16;
else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
ScalarVT = MVT::i32;
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
Info.align = Align::None();
Info.flags |= MachineMemOperand::MOStore;
break;
}
case GATHER:
case GATHER_AVX2: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = Align::None();
Info.flags |= MachineMemOperand::MOLoad;
break;
}
case SCATTER: {
Info.opc = ISD::INTRINSIC_VOID;
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = Align::None();
Info.flags |= MachineMemOperand::MOStore;
break;
}
default:
return false;
}
return true;
}
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
return true;
}
return false;
}
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
// relocation target a movq or addq instruction: don't let the load shrink.
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
// If this is an (1) AVX vector load with (2) multiple uses and (3) all of
// those uses are extracted directly into a store, then the extract + store
// can be store-folded. Therefore, it's probably not worth splitting the load.
EVT VT = Load->getValueType(0);
if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
// Skip uses of the chain value. Result 0 of the node is the load value.
if (UI.getUse().getResNo() != 0)
continue;
// If this use is not an extract + store, it's probably worth splitting.
if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
UI->use_begin()->getOpcode() != ISD::STORE)
return true;
}
// All non-chain uses are extract + store.
return false;
}
return true;
}
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0 || BitSize > 64)
return false;
return true;
}
bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
// If we are using XMM registers in the ABI and the condition of the select is
// a floating-point compare and we have blendv or conditional move, then it is
// cheaper to select instead of doing a cross-register move and creating a
// load that depends on the compare result.
bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
}
bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
// TODO: It might be a win to ease or lift this restriction, but the generic
// folds in DAGCombiner conflict with vector folds for an AVX512 target.
if (VT.isVector() && Subtarget.hasAVX512())
return false;
return true;
}
bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const {
// TODO: We handle scalars using custom code, but generic combining could make
// that unnecessary.
APInt MulC;
if (!ISD::isConstantSplatVector(C.getNode(), MulC))
return false;
// Find the type this will be legalized to. Otherwise we might prematurely
// convert this to shl+add/sub and then still have to type legalize those ops.
// Another choice would be to defer the decision for illegal types until
// after type legalization. But constant splat vectors of i64 can't make it
// through type legalization on 32-bit targets so we would need to special
// case vXi64.
while (getTypeAction(Context, VT) != TypeLegal)
VT = getTypeToTransformTo(Context, VT);
// If vector multiply is legal, assume that's faster than shl + add/sub.
// TODO: Multiply is a complex op with higher latency and lower throughput in
// most implementations, so this check could be loosened based on type
// and/or a CPU attribute.
if (isOperationLegal(ISD::MUL, VT))
return false;
// shl+add, shl+sub, shl+add+neg
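// e.g. x*3 -> (x<<1)+x, x*7 -> (x<<3)-x, x*-5 -> -((x<<2)+x).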
return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
(1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
// Mask vectors support all subregister combinations and operations that
// extract half of vector.
if (ResVT.getVectorElementType() == MVT::i1)
return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
(Index == ResVT.getVectorNumElements()));
return (Index % ResVT.getVectorNumElements()) == 0;
}
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
unsigned Opc = VecOp.getOpcode();
// Assume target opcodes can't be scalarized.
// TODO - do we have any exceptions?
if (Opc >= ISD::BUILTIN_OP_END)
return false;
// If the vector op is not supported, try to convert to scalar.
EVT VecVT = VecOp.getValueType();
if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
return true;
// If the vector op is supported, but the scalar op is not, the transform may
// not be worthwhile.
EVT ScalarVT = VecVT.getScalarType();
return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
}
bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
// TODO: Allow vectors?
if (VT.isVector())
return false;
return VT.isSimple() || !isOperationExpand(Opcode, VT);
}
bool X86TargetLowering::isCheapToSpeculateCttz() const {
// Speculate cttz only if we can directly use TZCNT.
return Subtarget.hasBMI();
}
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
// Speculate ctlz only if we can directly use LZCNT.
return Subtarget.hasLZCNT();
}
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
const SelectionDAG &DAG,
const MachineMemOperand &MMO) const {
if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
BitcastVT.getVectorElementType() == MVT::i1)
return false;
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
// If both types are legal vectors, it's always ok to convert them.
if (LoadVT.isVector() && BitcastVT.isVector() &&
isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
return true;
return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const SelectionDAG &DAG) const {
// Do not merge stores to a float value size (128 bits) if the
// NoImplicitFloat attribute is set.
bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (NoFloat) {
unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
return (MemVT.getSizeInBits() <= MaxIntSize);
}
// Make sure we don't merge greater than our preferred vector
// width.
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
return false;
return true;
}
bool X86TargetLowering::isCtlzFast() const {
return Subtarget.hasFastLZCNT();
}
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
return true;
}
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
EVT VT = Y.getValueType();
if (VT.isVector())
return false;
if (!Subtarget.hasBMI())
return false;
// There are only 32-bit and 64-bit forms for 'andn'.
if (VT != MVT::i32 && VT != MVT::i64)
return false;
return !isa<ConstantSDNode>(Y);
}
bool X86TargetLowering::hasAndNot(SDValue Y) const {
EVT VT = Y.getValueType();
if (!VT.isVector())
return hasAndNotCompare(Y);
// Vector.
if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
return false;
if (VT == MVT::v4i32)
return true;
return Subtarget.hasSSE2();
}
bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
return X.getValueType().isScalarInteger(); // 'bt'
}
bool X86TargetLowering::
shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
SelectionDAG &DAG) const {
// Does baseline recommend not to perform the fold by default?
if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
return false;
// For scalars this transform is always beneficial.
if (X.getValueType().isScalarInteger())
return true;
// If all the shift amounts are identical, then transform is beneficial even
// with rudimentary SSE2 shifts.
if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
return true;
// If we have AVX2 with its powerful shift operations, then it's also good.
if (Subtarget.hasAVX2())
return true;
// Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
return NewShiftOpcode == ISD::SHL;
}
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
assert(((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) ||
(N->getOpcode() == ISD::SRL &&
N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask");
EVT VT = N->getValueType(0);
if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
(Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
// Only fold if the shift values are equal - so it folds to AND.
// TODO - we should fold if either is a non-uniform vector but we don't do
// the fold for non-splats yet.
return N->getOperand(1) == N->getOperand(0).getOperand(1);
}
return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
}
bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
EVT VT = Y.getValueType();
// For vectors, we don't have a preference, but we probably want a mask.
if (VT.isVector())
return false;
// 64-bit shifts on 32-bit targets produce really bad bloated code.
if (VT == MVT::i64 && !Subtarget.is64Bit())
return false;
return true;
}
bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
SDNode *N) const {
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
!Subtarget.isOSWindows())
return false;
return true;
}
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
// Any legal vector type can be splatted more efficiently than
// loading/spilling from memory.
return isTypeLegal(VT);
}
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
MVT VT = MVT::getIntegerVT(NumBits);
if (isTypeLegal(VT))
return VT;
// PMOVMSKB can handle this.
if (NumBits == 128 && isTypeLegal(MVT::v16i8))
return MVT::v16i8;
// VPMOVMSKB can handle this.
if (NumBits == 256 && isTypeLegal(MVT::v32i8))
return MVT::v32i8;
// TODO: Allow 64-bit type for 32-bit target.
// TODO: 512-bit types should be allowed, but make sure that those
// cases are handled in combineVectorSizedSetCCEquality().
return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}
/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
/// Return true if every element in Mask, beginning from position Pos and ending
/// in Pos+Size, is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
return llvm::all_of(Mask.slice(Pos, Size),
[](int M) { return M == SM_SentinelUndef; });
}
/// Return true if the mask creates a vector whose lower half is undefined.
static bool isUndefLowerHalf(ArrayRef<int> Mask) {
unsigned NumElts = Mask.size();
return isUndefInRange(Mask, 0, NumElts / 2);
}
/// Return true if the mask creates a vector whose upper half is undefined.
static bool isUndefUpperHalf(ArrayRef<int> Mask) {
unsigned NumElts = Mask.size();
return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
}
/// Return true if Val falls within the specified range [Low, Hi).
static bool isInRange(int Val, int Low, int Hi) {
return (Val >= Low && Val < Hi);
}
/// Return true if the value of any element in Mask falls within the specified
/// range [Low, Hi).
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
}
/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::all_of(
Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
}
/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::all_of(
Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos + Size, falls within the specified
/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
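/// e.g. Mask = {4, -1, 6, 7} matches Pos = 0, Size = 4, Low = 4, Step = 1.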
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size, int Low, int Step = 1) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size, int Low,
int Step = 1) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
return false;
return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size) {
return llvm::all_of(Mask.slice(Pos, Size),
[](int M) { return isUndefOrZero(M); });
}
/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
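/// e.g. <0, 1, 6, 7> widens to <0, 3>, but <1, 0, 6, 7> cannot be widened
/// because its first pair is neither adjacent-and-aligned nor undef.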
static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
int M0 = Mask[i];
int M1 = Mask[i + 1];
// If both elements are undef, it's trivial.
if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
WidenedMask[i / 2] = SM_SentinelUndef;
continue;
}
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
WidenedMask[i / 2] = M1 / 2;
continue;
}
if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
WidenedMask[i / 2] = M0 / 2;
continue;
}
// When zeroing, we need to spread the zeroing across both lanes to widen.
if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
(M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
WidenedMask[i / 2] = SM_SentinelZero;
continue;
}
return false;
}
// Finally check if the two mask values are adjacent and aligned with
// a pair.
if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
WidenedMask[i / 2] = M0 / 2;
continue;
}
// Otherwise we can't safely widen the elements used in this shuffle.
return false;
}
assert(WidenedMask.size() == Mask.size() / 2 &&
"Incorrect size of mask after widening the elements!");
return true;
}
static bool canWidenShuffleElements(ArrayRef<int> Mask,
const APInt &Zeroable,
bool V2IsZero,
SmallVectorImpl<int> &WidenedMask) {
// Create an alternative mask with info about zeroable elements.
// Here we do not set undef elements as zeroable.
SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
if (V2IsZero) {
assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
for (int i = 0, Size = Mask.size(); i != Size; ++i)
if (Mask[i] != SM_SentinelUndef && Zeroable[i])
ZeroableMask[i] = SM_SentinelZero;
}
return canWidenShuffleElements(ZeroableMask, WidenedMask);
}
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
SmallVector<int, 32> WidenedMask;
return canWidenShuffleElements(Mask, WidenedMask);
}
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
}
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants into 32-bit halves when i64 is not legal (32-bit mode).
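// e.g. on a 32-bit target a v2i64 mask {1, 3} is built as the v4i32 vector
// {1, 0, 3, 0} and bitcast back to v2i64.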
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
const SDLoc &dl, bool IsMask = false) {
SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
unsigned NumElts = VT.getVectorNumElements();
bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
Split = true;
}
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0; i < NumElts; ++i) {
bool IsUndef = Values[i] < 0 && IsMask;
SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(Values[i], dl, EltVT);
Ops.push_back(OpNode);
if (Split)
Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(0, dl, EltVT));
}
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
if (Split)
ConstsNode = DAG.getBitcast(VT, ConstsNode);
return ConstsNode;
}
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert(Bits.size() == Undefs.getBitWidth() &&
"Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
unsigned NumElts = VT.getVectorNumElements();
bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
Split = true;
}
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
if (Undefs[i]) {
Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
continue;
}
const APInt &V = Bits[i];
assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
if (Split) {
Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
} else if (EltVT == MVT::f32) {
APFloat FV(APFloat::IEEEsingle(), V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else if (EltVT == MVT::f64) {
APFloat FV(APFloat::IEEEdouble(), V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else {
Ops.push_back(DAG.getConstant(V, dl, EltVT));
}
}
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
return DAG.getBitcast(VT, ConstsNode);
}
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
VT.getVectorElementType() == MVT::i1) &&
"Unexpected vector type");
// Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
// type. This ensures they get CSE'd. But if the integer type is not
// available, use a floating-point +0.0 instead.
SDValue Vec;
if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
} else if (VT.isFloatingPoint()) {
Vec = DAG.getConstantFP(+0.0, dl, VT);
} else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type");
Vec = DAG.getConstant(0, dl, VT);
} else {
unsigned Num32BitElts = VT.getSizeInBits() / 32;
Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
}
return DAG.getBitcast(VT, Vec);
}
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
const SDLoc &dl, unsigned vectorWidth) {
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
unsigned Factor = VT.getSizeInBits()/vectorWidth;
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
// we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
IdxVal &= ~(ElemsPerChunk - 1);
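// e.g. when extracting 128 bits from a v8i32 source, ElemsPerChunk == 4, so
// an IdxVal of 5 is rounded down to 4 (the start of the second chunk).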
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(ResultVT, dl,
Vec->ops().slice(IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert((Vec.getValueType().is256BitVector() ||
Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}
/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl,
unsigned vectorWidth) {
assert((vectorWidth == 128 || vectorWidth == 256) &&
"Unsupported vector width");
// Inserting an UNDEF subvector leaves Result unchanged.
if (Vec.isUndef())
return Result;
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
EVT ResultVT = Result.getValueType();
// Insert the relevant vectorWidth bits.
unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
// we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
IdxVal &= ~(ElemsPerChunk - 1);
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
Vec.getValueType().getScalarType() == VT.getScalarType() &&
"Unsupported vector widening type");
SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
: DAG.getUNDEF(VT);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
DAG.getIntPtrConstant(0, dl));
}
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl, unsigned WideSizeInBits) {
assert(Vec.getValueSizeInBits() < WideSizeInBits &&
(WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
"Unsupported vector widening type");
unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
MVT SVT = Vec.getSimpleValueType().getScalarType();
MVT VT = MVT::getVectorVT(SVT, WideNumElts);
return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}
// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
assert(Ops.empty() && "Expected an empty ops vector");
if (N->getOpcode() == ISD::CONCAT_VECTORS) {
Ops.append(N->op_begin(), N->op_end());
return true;
}
if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
isa<ConstantSDNode>(N->getOperand(2))) {
SDValue Src = N->getOperand(0);
SDValue Sub = N->getOperand(1);
const APInt &Idx = N->getConstantOperandAPInt(2);
EVT VT = Src.getValueType();
EVT SubVT = Sub.getValueType();
// TODO - Handle more general insert_subvector chains.
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
Idx == (VT.getVectorNumElements() / 2) &&
Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(1).getValueType() == SubVT &&
isNullConstant(Src.getOperand(2))) {
Ops.push_back(Src.getOperand(1));
Ops.push_back(Sub);
return true;
}
}
return false;
}
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
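// e.g. on AVX2 (without BWI) a v32i16 operation is split into two v16i16
// halves, Builder is invoked on each half, and the results are concatenated.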
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
F Builder, bool CheckBWI = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;
if ((CheckBWI && Subtarget.useBWIRegs()) ||
(!CheckBWI && Subtarget.useAVX512Regs())) {
if (VT.getSizeInBits() > 512) {
NumSubs = VT.getSizeInBits() / 512;
assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
}
} else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256) {
NumSubs = VT.getSizeInBits() / 256;
assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
}
} else {
if (VT.getSizeInBits() > 128) {
NumSubs = VT.getSizeInBits() / 128;
assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
}
}
if (NumSubs == 1)
return Builder(DAG, DL, Ops);
SmallVector<SDValue, 4> Subs;
for (unsigned i = 0; i != NumSubs; ++i) {
SmallVector<SDValue, 2> SubOps;
for (SDValue Op : Ops) {
EVT OpVT = Op.getValueType();
unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
}
Subs.push_back(Builder(DAG, DL, SubOps));
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
if (!isa<ConstantSDNode>(Idx))
return SDValue();
// Inserting undef is a nop. We can just return the original vector.
if (SubVec.isUndef())
return Vec;
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Extend to natively supported kshift.
MVT WideOpVT = OpVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
// Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
// if necessary.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
// May need to promote to a legal type.
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
MVT SubVecVT = SubVec.getSimpleValueType();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
SDValue Undef = DAG.getUNDEF(WideOpVT);
if (IdxVal == 0) {
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
SubVec, ZeroIdx);
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, SubVec, ZeroIdx);
if (Vec.isUndef()) {
assert(IdxVal != 0 && "Unexpected index");
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
assert(IdxVal != 0 && "Unexpected index");
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
if (ShiftRight != 0)
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
// Simple case: we put the subvector in the upper part.
if (IdxVal + SubVecNumElems == NumElems) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
// isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
Vec, ZeroIdx);
} else {
// Otherwise use explicit shifts to zero the bits.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, Vec, ZeroIdx);
NumElems = WideOpVT.getVectorNumElements();
SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
}
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// Inserting into the middle is more complicated.
NumElems = WideOpVT.getVectorNumElements();
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
// Do an optimization for the most frequently used types.
if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
Mask0.flipAllBits();
SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// Clear the upper bits of the subvector and move it to its insert position.
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
// Isolate the bits below the insertion point.
unsigned LowShift = NumElems - IdxVal;
SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
DAG.getTargetConstant(LowShift, dl, MVT::i8));
Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
DAG.getTargetConstant(LowShift, dl, MVT::i8));
// Isolate the bits after the last inserted bit.
unsigned HighShift = IdxVal + SubVecNumElems;
SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getTargetConstant(HighShift, dl, MVT::i8));
High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
DAG.getTargetConstant(HighShift, dl, MVT::i8));
// Now OR all 3 pieces together.
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
const SDLoc &dl) {
assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
EVT SubVT = V1.getValueType();
EVT SubSVT = SubVT.getScalarType();
unsigned SubNumElts = SubVT.getVectorNumElements();
unsigned SubVectorWidth = SubVT.getSizeInBits();
EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
}
/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
APInt Ones = APInt::getAllOnesValue(32);
unsigned NumElts = VT.getSizeInBits() / 32;
SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
}
// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
switch (Opcode) {
case ISD::ANY_EXTEND:
case ISD::ANY_EXTEND_VECTOR_INREG:
return ISD::ANY_EXTEND_VECTOR_INREG;
case ISD::ZERO_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG:
return ISD::ZERO_EXTEND_VECTOR_INREG;
case ISD::SIGN_EXTEND:
case ISD::SIGN_EXTEND_VECTOR_INREG:
return ISD::SIGN_EXTEND_VECTOR_INREG;
}
llvm_unreachable("Unknown opcode");
}
static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue In, SelectionDAG &DAG) {
EVT InVT = In.getValueType();
assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
ISD::ZERO_EXTEND == Opcode) &&
"Unknown extension opcode");
// For 256-bit vectors, we only need the lower (128-bit) input half.
// For 512-bit vectors, we only need the lower input half or quarter.
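// e.g. extending v32i8 to v8i32 (same total size) only needs the low v16i8
// and is converted to ZERO_EXTEND_VECTOR_INREG below.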
if (InVT.getSizeInBits() > 128) {
assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
"Expected VTs to be the same size!");
unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
In = extractSubVector(In, 0, DAG, DL,
std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
InVT = In.getValueType();
}
if (VT.getVectorNumElements() != InVT.getVectorNumElements())
Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
return DAG.getNode(Opcode, DL, VT, In);
}
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
V = peekThroughBitcasts(V);
if (V.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
return V.getOperand(0);
if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
(isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
Not, V.getOperand(1));
}
}
SmallVector<SDValue, 2> CatOps;
if (collectConcatOps(V.getNode(), CatOps)) {
for (SDValue &CatOp : CatOps) {
SDValue NotCat = IsNOT(CatOp, DAG);
if (!NotCat) return SDValue();
CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
}
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
}
return SDValue();
}
/// Returns a vector_shuffle node for an unpackl operation.
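/// e.g. for v4i32 this builds the interleaving mask <0, 4, 1, 5>.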
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Returns a vector_shuffle node for an unpackh operation.
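/// e.g. for v4i32 this builds the interleaving mask <2, 6, 3, 7>.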
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Return a vector_shuffle of the specified vector of zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
bool IsZero,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = V2.getSimpleValueType();
SDValue V1 = IsZero
? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
int NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec(NumElems);
for (int i = 0; i != NumElems; ++i)
// If this is the insertion idx, put the low elt of V2 here.
MaskVec[i] = (i == Idx) ? NumElems : i;
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
if (!Load || !ISD::isNormalLoad(Load))
return nullptr;
SDValue Ptr = Load->getBasePtr();
if (Ptr->getOpcode() == X86ISD::Wrapper ||
Ptr->getOpcode() == X86ISD::WrapperRIP)
Ptr = Ptr->getOperand(0);
auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
return nullptr;
return CNode->getConstVal();
}
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
}
const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
assert(LD && "Unexpected null LoadSDNode");
return getTargetConstantFromNode(LD);
}
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
SmallVectorImpl<APInt> &EltBits,
bool AllowWholeUndefs = true,
bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
// Bitcast a source array of element bits to the target size.
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
unsigned NumSrcElts = UndefSrcElts.getBitWidth();
unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
"Constant bit sizes don't match");
// Don't split if we don't allow undef bits.
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
if (UndefSrcElts.getBoolValue() && !AllowUndefs)
return false;
// If we're already the right size, don't bother bitcasting.
if (NumSrcElts == NumElts) {
UndefElts = UndefSrcElts;
EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
return true;
}
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
unsigned BitOffset = i * SrcEltSizeInBits;
if (UndefSrcElts[i])
UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
MaskBits.insertBits(SrcEltBits[i], BitOffset);
}
// Split the undef/constant single bitset data into the target elements.
UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
unsigned BitOffset = i * EltSizeInBits;
APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
// Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
if (!AllowWholeUndefs)
return false;
UndefElts.setBit(i);
continue;
}
// If only some bits are UNDEF then treat them as zero (or bail if not
// supported).
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
return false;
EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
}
return true;
};
// Collect constant bits and insert into mask/undef bit masks.
auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
unsigned UndefBitIndex) {
if (!Cst)
return false;
if (isa<UndefValue>(Cst)) {
Undefs.setBit(UndefBitIndex);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
Mask = CInt->getValue();
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
}
return false;
};
// Handle UNDEFs.
if (Op.isUndef()) {
APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract scalar constant bits.
if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
return CastBitData(UndefSrcElts, SrcEltBits);
}
if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SmallVector<APInt, 64> SrcEltBits(1, RawBits);
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantSDNode>(Src);
SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantFPSDNode>(Src);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
return false;
unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0; i != NumSrcElts; ++i)
if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
UndefSrcElts, i))
return false;
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
if (UndefSrcElts[0])
UndefSrcElts.setBits(0, NumSrcElts);
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
return false;
SDValue Ptr = MemIntr->getBasePtr();
if (Ptr->getOpcode() == X86ISD::Wrapper ||
Ptr->getOpcode() == X86ISD::WrapperRIP)
Ptr = Ptr->getOperand(0);
auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
if (!CNode || CNode->isMachineConstantPoolEntry() ||
CNode->getOffset() != 0)
return false;
if (const Constant *C = CNode->getConstVal()) {
unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
if (UndefSrcElts[0])
UndefSrcElts.setBits(0, NumSrcElts);
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
// Extract constant bits from a subvector broadcast.
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
SmallVector<APInt, 16> SubEltBits;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, SubEltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
UndefElts = APInt::getSplat(NumElts, UndefElts);
while (EltBits.size() < NumElts)
EltBits.append(SubEltBits.begin(), SubEltBits.end());
return true;
}
}
// Extract a rematerialized scalar constant insertion.
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits;
auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Insert constant bits from base and subvector sources.
if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(2))) {
// TODO - support insert_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
APInt UndefSubElts;
SmallVector<APInt, 32> EltSubBits;
if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefSubElts, EltSubBits,
AllowWholeUndefs, AllowPartialUndefs) &&
getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
unsigned BaseIdx = Op.getConstantOperandVal(2);
UndefElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
EltBits[BaseIdx + i] = EltSubBits[i];
return true;
}
}
// Extract constant bits from a subvector's source.
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(1))) {
// TODO - support extract_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
EVT SrcVT = Op.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumSubElts = VT.getVectorNumElements();
unsigned BaseIdx = Op.getConstantOperandVal(1);
UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
if ((BaseIdx + NumSubElts) != NumSrcElts)
EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
if (BaseIdx != 0)
EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
return true;
}
}
// Extract constant bits from shuffle node sources.
if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
// TODO - support shuffle through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
ArrayRef<int> Mask = SVN->getMask();
if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
llvm::any_of(Mask, [](int M) { return M < 0; }))
return false;
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if (isAnyInRange(Mask, 0, NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts0, EltBits0, AllowWholeUndefs,
AllowPartialUndefs))
return false;
if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefElts1, EltBits1, AllowWholeUndefs,
AllowPartialUndefs))
return false;
UndefElts = APInt::getNullValue(NumElts);
for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
if (M < 0) {
UndefElts.setBit(i);
EltBits.push_back(APInt::getNullValue(EltSizeInBits));
} else if (M < (int)NumElts) {
if (UndefElts0[M])
UndefElts.setBit(i);
EltBits.push_back(EltBits0[M]);
} else {
if (UndefElts1[M - NumElts])
UndefElts.setBit(i);
EltBits.push_back(EltBits1[M - NumElts]);
}
}
return true;
}
return false;
}
namespace llvm {
namespace X86 {
bool isConstantSplat(SDValue Op, APInt &SplatVal) {
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
UndefElts, EltBits, true, false)) {
int SplatIndex = -1;
for (int i = 0, e = EltBits.size(); i != e; ++i) {
if (UndefElts[i])
continue;
if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
SplatIndex = -1;
break;
}
SplatIndex = i;
}
if (0 <= SplatIndex) {
SplatVal = EltBits[SplatIndex];
return true;
}
}
return false;
}
} // namespace X86
} // namespace llvm
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask,
APInt &UndefElts) {
// Extract the raw target constant bits.
SmallVector<APInt, 64> EltBits;
if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
EltBits, /* AllowWholeUndefs */ true,
/* AllowPartialUndefs */ false))
return false;
// Insert the extracted elements into the mask.
for (APInt Elt : EltBits)
RawMask.push_back(Elt.getZExtValue());
return true;
}
/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
/// Note: This ignores saturation, so inputs must be checked first.
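/// A worked example derived from the loops below: for a binary PACKSSDW
/// producing MVT::v8i16 from two v4i32 inputs (a single 128-bit lane), the
/// mask is <0,2,4,6,8,10,12,14>; the unary form reuses the first input,
/// giving <0,2,4,6,0,2,4,6>.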
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
bool Unary) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
unsigned Offset = Unary ? 0 : NumElts;
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
Mask.push_back(Elt + (Lane * NumEltsPerLane));
for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
}
}
// Split the demanded elts of a PACKSS/PACKUS node between its operands.
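// For example, with VT = MVT::v16i8 (a single 128-bit lane), demanded result
// elts 0..7 map to LHS elts 0..7 and demanded result elts 8..15 map to RHS
// elts 0..7, so demanding result elt 11 demands RHS elt 3.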
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
APInt &DemandedLHS, APInt &DemandedRHS) {
int NumLanes = VT.getSizeInBits() / 128;
int NumElts = DemandedElts.getBitWidth();
int NumInnerElts = NumElts / 2;
int NumEltsPerLane = NumElts / NumLanes;
int NumInnerEltsPerLane = NumInnerElts / NumLanes;
DemandedLHS = APInt::getNullValue(NumInnerElts);
DemandedRHS = APInt::getNullValue(NumInnerElts);
// Map DemandedElts to the packed operands.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
int OuterIdx = (Lane * NumEltsPerLane) + Elt;
int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
if (DemandedElts[OuterIdx])
DemandedLHS.setBit(InnerIdx);
if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
DemandedRHS.setBit(InnerIdx);
}
}
}
// Split the demanded elts of a HADD/HSUB node between its operands.
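// For example, for a 256-bit HADD/HSUB with NumElts = 8 (two 128-bit lanes
// of four elts), demanding result elt 1 (lane 0, low half) demands LHS elts
// 2 and 3, while demanding result elt 2 (lane 0, high half) demands RHS elts
// 0 and 1.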
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
APInt &DemandedLHS, APInt &DemandedRHS) {
int NumLanes = VT.getSizeInBits() / 128;
int NumElts = DemandedElts.getBitWidth();
int NumEltsPerLane = NumElts / NumLanes;
int HalfEltsPerLane = NumEltsPerLane / 2;
DemandedLHS = APInt::getNullValue(NumElts);
DemandedRHS = APInt::getNullValue(NumElts);
// Map DemandedElts to the horizontal operands.
for (int Idx = 0; Idx != NumElts; ++Idx) {
if (!DemandedElts[Idx])
continue;
int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
int LocalIdx = Idx % NumEltsPerLane;
if (LocalIdx < HalfEltsPerLane) {
DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
} else {
LocalIdx -= HalfEltsPerLane;
DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
}
}
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
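/// For example, BLENDI(X, X, imm) is a "fake unary" shuffle: it is reported
/// as unary, and any mask indices that referenced the identical second
/// operand are remapped to point into the first.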
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
SmallVectorImpl<SDValue> &Ops,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
APInt RawUndefs;
SDValue ImmN;
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
IsUnary = false;
bool IsFakeUnary = false;
switch (N->getOpcode()) {
case X86ISD::BLENDI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeSHUFPMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::EXTRQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
if (isa<ConstantSDNode>(N->getOperand(1)) &&
isa<ConstantSDNode>(N->getOperand(2))) {
int BitLen = N->getConstantOperandVal(1);
int BitIdx = N->getConstantOperandVal(2);
DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = true;
}
break;
case X86ISD::INSERTQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
if (isa<ConstantSDNode>(N->getOperand(2)) &&
isa<ConstantSDNode>(N->getOperand(3))) {
int BitLen = N->getConstantOperandVal(2);
int BitIdx = N->getConstantOperandVal(3);
DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
}
break;
case X86ISD::UNPCKH:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::UNPCKL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVHLPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVHLPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVLHPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::PALIGNR:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
break;
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeZeroMoveLowMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
SDValue N0 = N->getOperand(0);
// See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
// add the pre-extracted value to the Ops vector.
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getOperand(0).getValueType() == VT &&
N0.getConstantOperandVal(1) == 0)
Ops.push_back(N0.getOperand(0));
// We only decode broadcasts of same-sized vectors, unless the broadcast
// came from an extract of the original-width vector. If we found one, we
// pushed it onto the Ops vector above.
if (N0.getValueType() == VT || !Ops.empty()) {
DecodeVectorBroadcast(NumElems, Mask);
IsUnary = true;
break;
}
return false;
}
case X86ISD::VPERMILPV: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::PSHUFB: {
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodePSHUFBMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
break;
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUF128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSLDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVSHDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSHDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVDDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVDDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VPERMIL2: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
SDValue CtrlNode = N->getOperand(3);
if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
unsigned CtrlImm = CtrlOp->getZExtValue();
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
Mask);
break;
}
}
return false;
}
case X86ISD::VPPERM: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodeVPPERMMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMV: {
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
Ops.push_back(N->getOperand(1));
SDValue MaskNode = N->getOperand(0);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMVMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMV3: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
Ops.push_back(N->getOperand(0));
Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
default: llvm_unreachable("unknown target shuffle node");
}
// Empty mask indicates the decode failed.
if (Mask.empty())
return false;
// Check if we're getting a shuffle mask with zeroed elements.
if (!AllowSentinelZero)
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
// into the first input.
if (IsFakeUnary)
for (int &M : Mask)
if (M >= (int)Mask.size())
M -= Mask.size();
// If we didn't already add operands in the opcode-specific code, default to
// adding 1 or 2 operands starting at 0.
if (Ops.empty()) {
Ops.push_back(N->getOperand(0));
if (!IsUnary || IsFakeUnary)
Ops.push_back(N->getOperand(1));
}
return true;
}
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
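/// For instance, given Mask = <0, 5, -1, 7> with V2 an all-zeros build
/// vector, element 0 is taken from V1, elements 1 and 3 are known zero (they
/// reference the zero input), and element 2 is known undef.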
static void computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue V1, SDValue V2,
APInt &KnownUndef, APInt &KnownZero) {
int Size = Mask.size();
KnownUndef = KnownZero = APInt::getNullValue(Size);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
int VectorSizeInBits = V1.getValueSizeInBits();
int ScalarSizeInBits = VectorSizeInBits / Size;
assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
for (int i = 0; i < Size; ++i) {
int M = Mask[i];
// Handle the easy cases.
if (M < 0) {
KnownUndef.setBit(i);
continue;
}
if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
KnownZero.setBit(i);
continue;
}
// Determine shuffle input and normalize the mask.
SDValue V = M < Size ? V1 : V2;
M %= Size;
// Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
if (V.getOpcode() != ISD::BUILD_VECTOR)
continue;
// If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
// the (larger) source element must be UNDEF/ZERO.
if ((Size % V.getNumOperands()) == 0) {
int Scale = Size / V->getNumOperands();
SDValue Op = V.getOperand(M / Scale);
if (Op.isUndef())
KnownUndef.setBit(i);
if (X86::isZeroNode(Op))
KnownZero.setBit(i);
else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt Val = Cst->getAPIntValue();
Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
if (Val == 0)
KnownZero.setBit(i);
} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt Val = Cst->getValueAPF().bitcastToAPInt();
Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
if (Val == 0)
KnownZero.setBit(i);
}
continue;
}
// If the BUILD_VECTOR has more elements, then all the (smaller) source
// elements must be UNDEF or ZERO.
if ((V.getNumOperands() % Size) == 0) {
int Scale = V->getNumOperands() / Size;
bool AllUndef = true;
bool AllZero = true;
for (int j = 0; j < Scale; ++j) {
SDValue Op = V.getOperand((M * Scale) + j);
AllUndef &= Op.isUndef();
AllZero &= X86::isZeroNode(Op);
}
if (AllUndef)
KnownUndef.setBit(i);
if (AllZero)
KnownZero.setBit(i);
continue;
}
}
}
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
/// FIXME: Merge this with computeZeroableShuffleElements?
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
APInt &KnownUndef, APInt &KnownZero) {
bool IsUnary;
if (!isTargetShuffle(N.getOpcode()))
return false;
MVT VT = N.getSimpleValueType();
if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
return false;
int Size = Mask.size();
SDValue V1 = Ops[0];
SDValue V2 = IsUnary ? V1 : Ops[1];
KnownUndef = KnownZero = APInt::getNullValue(Size);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
assert((VT.getSizeInBits() % Size) == 0 &&
"Illegal split of shuffle value type");
unsigned EltSizeInBits = VT.getSizeInBits() / Size;
// Extract known constant input data.
APInt UndefSrcElts[2];
SmallVector<APInt, 32> SrcEltBits[2];
bool IsSrcConstant[2] = {
getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
SrcEltBits[0], true, false),
getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
SrcEltBits[1], true, false)};
for (int i = 0; i < Size; ++i) {
int M = Mask[i];
// Already decoded as SM_SentinelZero / SM_SentinelUndef.
if (M < 0) {
assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
if (SM_SentinelUndef == M)
KnownUndef.setBit(i);
if (SM_SentinelZero == M)
KnownZero.setBit(i);
continue;
}
// Determine shuffle input and normalize the mask.
unsigned SrcIdx = M / Size;
SDValue V = M < Size ? V1 : V2;
M %= Size;
// We are referencing an UNDEF input.
if (V.isUndef()) {
KnownUndef.setBit(i);
continue;
}
// SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
// TODO: We currently only set UNDEF for integer types - floats use the same
// registers as vectors and many of the scalar folded loads rely on the
// SCALAR_TO_VECTOR pattern.
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Size % V.getValueType().getVectorNumElements()) == 0) {
int Scale = Size / V.getValueType().getVectorNumElements();
int Idx = M / Scale;
if (Idx != 0 && !VT.isFloatingPoint())
KnownUndef.setBit(i);
else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
KnownZero.setBit(i);
continue;
}
// Attempt to extract from the source's constant bits.
if (IsSrcConstant[SrcIdx]) {
if (UndefSrcElts[SrcIdx][M])
KnownUndef.setBit(i);
else if (SrcEltBits[SrcIdx][M] == 0)
KnownZero.setBit(i);
}
}
assert(VT.getVectorNumElements() == (unsigned)Size &&
"Different mask size from vector size!");
return true;
}
// Replace target shuffle mask elements with known undef/zero sentinels.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
const APInt &KnownUndef,
const APInt &KnownZero,
bool ResolveKnownZeros = true) {
unsigned NumElts = Mask.size();
assert(KnownUndef.getBitWidth() == NumElts &&
KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
for (unsigned i = 0; i != NumElts; ++i) {
if (KnownUndef[i])
Mask[i] = SM_SentinelUndef;
else if (ResolveKnownZeros && KnownZero[i])
Mask[i] = SM_SentinelZero;
}
}
// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
APInt &KnownUndef,
APInt &KnownZero) {
unsigned NumElts = Mask.size();
KnownUndef = KnownZero = APInt::getNullValue(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
int M = Mask[i];
if (SM_SentinelUndef == M)
KnownUndef.setBit(i);
if (SM_SentinelZero == M)
KnownZero.setBit(i);
}
}
// Forward declaration (for getFauxShuffleMask recursive check).
// TODO: Use DemandedElts variant.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements than
// the destination value type.
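// For example, the ISD::AND case below decodes AND(X, <0xFF,0x00,0xFF,...>)
// on a byte vector as the shuffle <0, SM_SentinelZero, 2, ...> of X alone.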
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
Mask.clear();
Ops.clear();
MVT VT = N.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
unsigned NumSizeInBits = VT.getSizeInBits();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
return false;
assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
unsigned Opcode = N.getOpcode();
switch (Opcode) {
case ISD::VECTOR_SHUFFLE: {
// ISD::VECTOR_SHUFFLE isn't treated as a target shuffle, so decode it here.
ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
Mask.append(ShuffleMask.begin(), ShuffleMask.end());
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
return true;
}
return false;
}
case ISD::AND:
case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
bool IsAndN = (X86ISD::ANDNP == Opcode);
uint64_t ZeroMask = IsAndN ? 255 : 0;
if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
Mask.push_back(SM_SentinelUndef);
continue;
}
const APInt &ByteBits = EltBits[i];
if (ByteBits != 0 && ByteBits != 255)
return false;
Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
}
Ops.push_back(IsAndN ? N1 : N0);
return true;
}
case ISD::OR: {
// Inspect each operand at the byte level. We can merge these into a
// blend shuffle mask if, for each byte, at least one operand is masked
// out (zero).
KnownBits Known0 =
DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
KnownBits Known1 =
DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
unsigned NumSizeInBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
if (LHS == 255 && RHS == 0)
SelectMask.setBit(i);
else if (LHS == 255 && RHS == 255)
ZeroMask.setBit(i);
else if (!(LHS == 0 && RHS == 255))
IsByteMask = false;
}
if (IsByteMask) {
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
for (unsigned j = 0; j != NumBytesPerElt; ++j) {
unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
Mask.push_back(Idx);
}
}
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
return true;
}
}
// Handle the OR(SHUFFLE,SHUFFLE) case, where for each element one source
// is zero and the other provides a valid shuffle index.
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
true) ||
!getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
true))
return false;
size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (size_t i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
Mask.push_back(SM_SentinelZero);
else if (Mask1[i] == SM_SentinelZero)
Mask.push_back(Mask0[i]);
else if (Mask0[i] == SM_SentinelZero)
Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
else
return false;
}
Ops.append(SrcInputs0.begin(), SrcInputs0.end());
Ops.append(SrcInputs1.begin(), SrcInputs1.end());
return true;
}
case ISD::INSERT_SUBVECTOR: {
SDValue Src = N.getOperand(0);
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
if (!isa<ConstantSDNode>(N.getOperand(2)) ||
!N->isOnlyUserOf(Sub.getNode()))
return false;
uint64_t InsertIdx = N.getConstantOperandVal(2);
// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Sub.getOperand(0).getValueType() == VT &&
isa<ConstantSDNode>(Sub.getOperand(1))) {
uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i)
Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
Ops.push_back(Src);
Ops.push_back(Sub.getOperand(0));
return true;
}
// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
SubMask, DAG, Depth + 1, ResolveKnownElts))
return false;
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
if ((NumSubElts % SubMask.size()) == 0) {
int Scale = NumSubElts / SubMask.size();
SmallVector<int, 64> ScaledSubMask;
scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
SubMask = ScaledSubMask;
} else {
int Scale = SubMask.size() / NumSubElts;
NumSubElts = SubMask.size();
NumElts *= Scale;
InsertIdx *= Scale;
}
}
Ops.push_back(Src);
for (SDValue &SubInput : SubInputs) {
EVT SubSVT = SubInput.getValueType().getScalarType();
EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
NumSizeInBits / SubSVT.getSizeInBits());
Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
DAG.getUNDEF(AltVT), SubInput,
DAG.getIntPtrConstant(0, SDLoc(N))));
}
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
if (0 <= M) {
int InputIdx = M / NumSubElts;
M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
}
Mask[i + InsertIdx] = M;
}
return true;
}
case ISD::SCALAR_TO_VECTOR: {
// Match against a scalar_to_vector of an extract from a vector,
// for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
SDValue N0 = N.getOperand(0);
SDValue SrcExtract;
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
N0.getOperand(0).getValueType() == VT) ||
(N0.getOpcode() == X86ISD::PEXTRW &&
N0.getOperand(0).getValueType() == MVT::v8i16) ||
(N0.getOpcode() == X86ISD::PEXTRB &&
N0.getOperand(0).getValueType() == MVT::v16i8)) {
SrcExtract = N0;
}
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
return false;
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
if (NumSrcElts <= SrcIdx)
return false;
Ops.push_back(SrcVec);
Mask.push_back(SrcIdx);
Mask.append(NumZeros, SM_SentinelZero);
Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
return true;
}
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
SDValue InVec = N.getOperand(0);
SDValue InScl = N.getOperand(1);
SDValue InIndex = N.getOperand(2);
if (!isa<ConstantSDNode>(InIndex) ||
cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
return false;
uint64_t InIdx = N.getConstantOperandVal(2);
// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
if (X86::isZeroNode(InScl)) {
Ops.push_back(InVec);
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
return true;
}
// Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
// TODO: Expand this to support INSERT_VECTOR_ELT/etc.
unsigned ExOp =
(X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
if (InScl.getOpcode() != ExOp)
return false;
SDValue ExVec = InScl.getOperand(0);
SDValue ExIndex = InScl.getOperand(1);
if (!isa<ConstantSDNode>(ExIndex) ||
cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
return false;
uint64_t ExIdx = InScl.getConstantOperandVal(1);
Ops.push_back(InVec);
Ops.push_back(ExVec);
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
return true;
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type");
APInt EltsLHS, EltsRHS;
getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
// If we know input saturation won't happen, we can treat this
// as a truncation shuffle.
if (Opcode == X86ISD::PACKSS) {
if ((!N0.isUndef() &&
DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
(!N1.isUndef() &&
DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
return false;
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
if ((!N0.isUndef() &&
!DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
(!N1.isUndef() &&
!DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
return false;
}
bool IsUnary = (N0 == N1);
Ops.push_back(N0);
if (!IsUnary)
Ops.push_back(N1);
createPackShuffleMask(VT, Mask, IsUnary);
return true;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
// Out of range bit shifts are guaranteed to be zero.
if (NumBitsPerElt <= ShiftVal) {
Mask.append(NumElts, SM_SentinelZero);
return true;
}
// We can only decode 'whole byte' bit shifts as shuffles.
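// For example, a VSRLI of v2i64 by 8 bits moves each 8-byte element right by
// one byte, i.e. the byte shuffle <1,2,3,4,5,6,7,Z,9,10,11,12,13,14,15,Z>
// (Z = SM_SentinelZero) produced by the loops below.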
if ((ShiftVal % 8) != 0)
break;
uint64_t ByteShift = ShiftVal / 8;
unsigned NumBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
Ops.push_back(N.getOperand(0));
// Clear mask to all zeros and insert the shifted byte indices.
Mask.append(NumBytes, SM_SentinelZero);
if (X86ISD::VSHLI == Opcode) {
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
}
return true;
}
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (!SrcVT.isVector())
return false;
if (NumSizeInBits != SrcVT.getSizeInBits()) {
assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
"Illegal broadcast type");
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumSizeInBits / SrcVT.getScalarSizeInBits());
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
DAG.getUNDEF(SrcVT), Src,
DAG.getIntPtrConstant(0, SDLoc(N)));
}
Ops.push_back(Src);
Mask.append(NumElts, 0);
return true;
}
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::ANY_EXTEND_VECTOR_INREG: {
SDValue Src = N.getOperand(0);
EVT SrcVT = Src.getValueType();
// Extended source must be a simple vector.
if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
bool IsAnyExtend =
(ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
Mask);
if (NumSizeInBits != SrcVT.getSizeInBits()) {
assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
"Illegal zero-extension type");
SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
NumSizeInBits / NumSrcBitsPerElt);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
DAG.getUNDEF(SrcVT), Src,
DAG.getIntPtrConstant(0, SDLoc(N)));
}
Ops.push_back(Src);
return true;
}
}
return false;
}
/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
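/// For instance, if Inputs = [A, B, A] with a mask width of 4, indices that
/// referenced the second copy of A (8..11) are remapped onto the first copy
/// (0..3), higher indices shift down by 4, and Inputs becomes [A, B].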
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask) {
int MaskWidth = Mask.size();
SmallVector<SDValue, 16> UsedInputs;
for (int i = 0, e = Inputs.size(); i < e; ++i) {
int lo = UsedInputs.size() * MaskWidth;
int hi = lo + MaskWidth;
// Strip UNDEF input usage.
if (Inputs[i].isUndef())
for (int &M : Mask)
if ((lo <= M) && (M < hi))
M = SM_SentinelUndef;
// Check for unused inputs.
if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
for (int &M : Mask)
if (lo <= M)
M -= MaskWidth;
continue;
}
// Check for repeated inputs.
bool IsRepeat = false;
for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
if (UsedInputs[j] != Inputs[i])
continue;
for (int &M : Mask)
if (lo <= M)
M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
IsRepeat = true;
break;
}
if (IsRepeat)
continue;
UsedInputs.push_back(Inputs[i]);
}
Inputs = UsedInputs;
}
/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
/// Returns true if the target shuffle mask was decoded.
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
APInt &KnownUndef, APInt &KnownZero,
SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
return false;
if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
if (ResolveKnownElts)
resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
return true;
}
if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
ResolveKnownElts)) {
resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
return true;
}
return false;
}
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
SelectionDAG &DAG, unsigned Depth = 0,
bool ResolveKnownElts = true) {
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
return false;
APInt KnownUndef, KnownZero;
unsigned NumElts = Op.getValueType().getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
KnownZero, DAG, Depth, ResolveKnownElts);
}
/// Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
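/// For example, requesting element 2 of VECTOR_SHUFFLE<4,1,6,3>(A, B)
/// recurses into element 2 of B, since mask index 6 selects from the second
/// operand.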
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
unsigned Depth) {
if (Depth == 6)
return SDValue(); // Limit search depth.
SDValue V = SDValue(N, 0);
EVT VT = V.getValueType();
unsigned Opcode = V.getOpcode();
// Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
int Elt = SV->getMaskElt(Index);
if (Elt < 0)
return DAG.getUNDEF(VT.getVectorElementType());
unsigned NumElems = VT.getVectorNumElements();
SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
: SV->getOperand(1);
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
}
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
MVT ShufVT = V.getSimpleValueType();
MVT ShufSVT = ShufVT.getVectorElementType();
int NumElems = (int)ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 16> ShuffleOps;
bool IsUnary;
if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
if (Elt == SM_SentinelZero)
return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
: DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
if (Elt == SM_SentinelUndef)
return DAG.getUNDEF(ShufSVT);
assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
Depth+1);
}
// Recurse into insert_subvector base/sub vector to find scalars.
if (Opcode == ISD::INSERT_SUBVECTOR &&
isa<ConstantSDNode>(N->getOperand(2))) {
SDValue Vec = N->getOperand(0);
SDValue Sub = N->getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
uint64_t SubIdx = N->getConstantOperandVal(2);
if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
}
// Recurse into extract_subvector src vector to find scalars.
if (Opcode == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(N->getOperand(1))) {
SDValue Src = N->getOperand(0);
uint64_t SrcIdx = N->getConstantOperandVal(1);
return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
}
// Actual nodes that may contain scalar elements.
if (Opcode == ISD::BITCAST) {
V = V.getOperand(0);
EVT SrcVT = V.getValueType();
unsigned NumElems = VT.getVectorNumElements();
if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
return SDValue();
}
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
return (Index == 0) ? V.getOperand(0)
: DAG.getUNDEF(VT.getVectorElementType());
if (V.getOpcode() == ISD::BUILD_VECTOR)
return V.getOperand(Index);
return SDValue();
}
// Use PINSRB/PINSRW/PINSRD to create a build vector.
static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
"Illegal vector insertion");
SDLoc dl(Op);
SDValue V;
bool First = true;
for (unsigned i = 0; i < NumElts; ++i) {
bool IsNonZero = (NonZeros & (1 << i)) != 0;
if (!IsNonZero)
continue;
// If the build vector contains zeros or our first insertion is not the
// first index, then insert into a zero vector to break any register
// dependency; otherwise use SCALAR_TO_VECTOR.
if (First) {
First = false;
if (NumZero || 0 != i)
V = getZeroVector(VT, Subtarget, DAG, dl);
else {
assert(0 == i && "Expected insertion into zero-index");
V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
V = DAG.getBitcast(VT, V);
continue;
}
}
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
DAG.getIntPtrConstant(i, dl));
}
return V;
}
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (NumNonZero > 8 && !Subtarget.hasSSE41())
return SDValue();
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41())
return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
Subtarget);
SDLoc dl(Op);
SDValue V;
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
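// Each pair of adjacent bytes (lo, hi) is combined as (hi << 8) | lo in a
// 32-bit scalar, truncated to i16, and inserted at word index i/2.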
for (unsigned i = 0; i < 16; i += 2) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
if (!ThisIsNonZero && !NextIsNonZero)
continue;
// FIXME: Investigate combining the first 4 bytes as a i32 instead.
SDValue Elt;
if (ThisIsNonZero) {
if (NumZero || NextIsNonZero)
Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
else
Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
}
if (NextIsNonZero) {
SDValue NextElt = Op.getOperand(i + 1);
if (i == 0 && NumZero)
NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
else
NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
DAG.getConstant(8, dl, MVT::i8));
if (ThisIsNonZero)
Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
else
Elt = NextElt;
}
// If our first insertion is not the first index, then insert into a zero
// vector to break any register dependency; otherwise use SCALAR_TO_VECTOR.
if (!V) {
if (i != 0)
V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
V = DAG.getBitcast(MVT::v8i16, V);
continue;
}
}
Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
DAG.getIntPtrConstant(i / 2, dl));
}
return DAG.getBitcast(MVT::v16i8, V);
}
/// Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (NumNonZero > 4 && !Subtarget.hasSSE41())
return SDValue();
// Use PINSRW to insert each word directly.
return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
Subtarget);
}
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// If this is a splat of a pair of elements, use MOVDDUP (unless the target
// has XOP; in that case defer lowering to potentially use VPERMIL2PS).
// Because we're creating a less complicated build vector here, we may enable
// further folding of the MOVDDUP via shuffle transforms.
if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
Op.getOperand(0) == Op.getOperand(2) &&
Op.getOperand(1) == Op.getOperand(3) &&
Op.getOperand(0) != Op.getOperand(1)) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
// Create a new build vector with the first 2 elements followed by undef
// padding, bitcast to v2f64, duplicate, and bitcast back.
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
return DAG.getBitcast(VT, Dup);
}
// Find all zeroable elements.
std::bitset<4> Zeroable, Undefs;
for (int i = 0; i < 4; ++i) {
SDValue Elt = Op.getOperand(i);
Undefs[i] = Elt.isUndef();
Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
// We only know how to deal with build_vector nodes where elements are either
// zeroable or extract_vector_elt with constant index.
SDValue FirstNonZero;
unsigned FirstNonZeroIdx;
for (unsigned i = 0; i < 4; ++i) {
if (Zeroable[i])
continue;
SDValue Elt = Op.getOperand(i);
if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Elt.getOperand(1)))
return SDValue();
// Make sure that this node is extracting from a 128-bit vector.
MVT VT = Elt.getOperand(0).getSimpleValueType();
if (!VT.is128BitVector())
return SDValue();
if (!FirstNonZero.getNode()) {
FirstNonZero = Elt;
FirstNonZeroIdx = i;
}
}
assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
SDValue V1 = FirstNonZero.getOperand(0);
MVT VT = V1.getSimpleValueType();
// See if this build_vector can be lowered as a blend with zero.
SDValue Elt;
unsigned EltMaskIdx, EltIdx;
int Mask[4];
for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
if (Zeroable[EltIdx]) {
// The zero vector will be on the right hand side.
Mask[EltIdx] = EltIdx+4;
continue;
}
Elt = Op->getOperand(EltIdx);
// By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
EltMaskIdx = Elt.getConstantOperandVal(1);
if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
break;
Mask[EltIdx] = EltIdx;
}
if (EltIdx == 4) {
// Let the shuffle legalizer deal with blend operations.
SDValue VZeroOrUndef = (Zeroable == Undefs)
? DAG.getUNDEF(VT)
: getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
if (V1.getSimpleValueType() != VT)
V1 = DAG.getBitcast(VT, V1);
return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
}
// See if we can lower this build_vector to an INSERTPS.
if (!Subtarget.hasSSE41())
return SDValue();
SDValue V2 = Elt.getOperand(0);
if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
V1 = SDValue();
bool CanFold = true;
for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
if (Zeroable[i])
continue;
SDValue Current = Op->getOperand(i);
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
}
if (!CanFold)
return SDValue();
assert(V1.getNode() && "Expected at least two non-zero elements!");
if (V1.getSimpleValueType() != MVT::v4f32)
V1 = DAG.getBitcast(MVT::v4f32, V1);
if (V2.getSimpleValueType() != MVT::v4f32)
V2 = DAG.getBitcast(MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
unsigned ZMask = Zeroable.to_ulong();
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
SDLoc DL(Op);
SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getIntPtrConstant(InsertPSMask, DL, true));
return DAG.getBitcast(VT, Result);
}
/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
SelectionDAG &DAG, const TargetLowering &TLI,
const SDLoc &dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
MVT ShVT = MVT::v16i8;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
SelectionDAG &DAG) {
// Check if the scalar load can be widened into a vector load, and if the
// address is "base + cst", see if the cst can be "absorbed" into
// the shuffle mask.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
SDValue Ptr = LD->getBasePtr();
if (!ISD::isNormalLoad(LD) || !LD->isSimple())
return SDValue();
EVT PVT = LD->getValueType(0);
if (PVT != MVT::i32 && PVT != MVT::f32)
return SDValue();
int FI = -1;
int64_t Offset = 0;
if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
FI = FINode->getIndex();
Offset = 0;
} else if (DAG.isBaseWithConstantOffset(Ptr) &&
isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
Offset = Ptr.getConstantOperandVal(1);
Ptr = Ptr.getOperand(0);
} else {
return SDValue();
}
// FIXME: 256-bit vector instructions don't require strict alignment;
// improve this code to support them better.
unsigned RequiredAlign = VT.getSizeInBits()/8;
SDValue Chain = LD->getChain();
// Make sure the stack object alignment is at least 16 or 32.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
if (MFI.isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute the exact
// stack offset and reference FI + adjust offset instead, if someone
// *really* cares about this; that's the way to implement it.
return SDValue();
} else {
MFI.setObjectAlignment(FI, RequiredAlign);
}
}
// (Offset % 16 or 32) must be a multiple of 4. The address is then
// Ptr + (Offset & ~(RequiredAlign - 1)).
if (Offset < 0)
return SDValue();
if ((Offset % RequiredAlign) & 3)
return SDValue();
int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
if (StartOffset) {
SDLoc DL(Ptr);
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
}
int EltNo = (Offset - StartOffset) >> 2;
unsigned NumElems = VT.getVectorNumElements();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset));
SmallVector<int, 8> Mask(NumElems, EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
}
return SDValue();
}
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
if (ISD::isNON_EXTLoad(Elt.getNode())) {
auto *BaseLd = cast<LoadSDNode>(Elt);
if (!BaseLd->isSimple())
return false;
Ld = BaseLd;
ByteOffset = 0;
return true;
}
switch (Elt.getOpcode()) {
case ISD::BITCAST:
case ISD::TRUNCATE:
case ISD::SCALAR_TO_VECTOR:
return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
case ISD::SRL:
if (isa<ConstantSDNode>(Elt.getOperand(1))) {
uint64_t Idx = Elt.getConstantOperandVal(1);
if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
ByteOffset += Idx / 8;
return true;
}
}
break;
case ISD::EXTRACT_VECTOR_ELT:
if (isa<ConstantSDNode>(Elt.getOperand(1))) {
SDValue Src = Elt.getOperand(0);
unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
findEltLoadSrc(Src, Ld, ByteOffset)) {
uint64_t Idx = Elt.getConstantOperandVal(1);
ByteOffset += Idx * (SrcSizeInBits / 8);
return true;
}
}
break;
}
return false;
}
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
if ((VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
APInt LoadMask = APInt::getNullValue(NumElems);
APInt ZeroMask = APInt::getNullValue(NumElems);
APInt UndefMask = APInt::getNullValue(NumElems);
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
// For each element in the initializer, see if we've found a load, a zero,
// or an undef.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (!Elt.getNode())
return SDValue();
if (Elt.isUndef()) {
UndefMask.setBit(i);
continue;
}
if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
ZeroMask.setBit(i);
continue;
}
// Each loaded element must be the correct fractional portion of the
// requested vector load.
unsigned EltSizeInBits = Elt.getValueSizeInBits();
if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
return SDValue();
if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
return SDValue();
unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
return SDValue();
LoadMask.setBit(i);
LastLoadedElt = i;
}
assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
LoadMask.countPopulation()) == NumElems &&
"Incomplete element masks");
// Handle Special Cases - all undef or undef/zero.
if (UndefMask.countPopulation() == NumElems)
return DAG.getUNDEF(VT);
// FIXME: Should we return this as a BUILD_VECTOR instead?
if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
int FirstLoadedElt = LoadMask.countTrailingZeros();
SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
EVT EltBaseVT = EltBase.getValueType();
assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
"Register/Memory size mismatch");
LoadSDNode *LDBase = Loads[FirstLoadedElt];
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
// TODO: Support offsetting the base load.
if (ByteOffsets[FirstLoadedElt] != 0)
return SDValue();
// Check to see if the element's load is consecutive to the base load
// or offset from a previous (already checked) load.
auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
LoadSDNode *Ld = Loads[EltIdx];
int64_t ByteOffset = ByteOffsets[EltIdx];
if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
}
return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
EltIdx - FirstLoadedElt);
};
// Consecutive loads can contain UNDEFs but not ZERO elements.
// Consecutive loads with UNDEF and ZERO elements require an
// additional shuffle stage to clear the ZERO elements.
bool IsConsecutiveLoad = true;
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
if (!CheckConsecutiveLoad(LDBase, i)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
}
} else if (ZeroMask[i]) {
IsConsecutiveLoad = false;
}
}
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(LDBase->isSimple() &&
"Cannot merge volatile or atomic loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, NewLd);
return NewLd;
};
// Check if the base load is entirely dereferenceable.
bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
// LOAD - all consecutive load/undefs (must start/end with a load or be
// entirely dereferenceable). If we have found an entire vector of loads and
// undefs, then return a large load of the entire vector width starting at the
// base pointer. If the vector contains zeros, then attempt to shuffle those
// elements.
if (FirstLoadedElt == 0 &&
(LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
// Don't create 256-bit non-temporal aligned loads without AVX2 as these
// will lower to regular temporal loads and use the cache.
if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();
if (NumElems == 1)
return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
if (!ZeroMask)
return CreateLoad(VT, LDBase);
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
if (!isAfterLegalize && VT.isVector()) {
unsigned NumMaskElts = VT.getVectorNumElements();
if ((NumMaskElts % NumElems) == 0) {
unsigned Scale = NumMaskElts / NumElems;
SmallVector<int, 4> ClearMask(NumMaskElts, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (UndefMask[i])
continue;
int Offset = ZeroMask[i] ? NumMaskElts : 0;
for (unsigned j = 0; j != Scale; ++j)
ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
}
SDValue V = CreateLoad(VT, LDBase);
SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
}
}
}
// If the upper half of a ymm/zmm load is undef then just load the lower half.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned HalfNumElems = NumElems / 2;
if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
SDValue HalfLD =
EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
DAG, Subtarget, isAfterLegalize);
if (HalfLD)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
HalfLD, DAG.getIntPtrConstant(0, DL));
}
}
// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
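// e.g. a build_vector of a single 32-bit load followed by zeros/undefs can
// become one X86ISD::VZEXT_LOAD that zero-fills the upper lanes (the
// movd/movss-style pattern).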
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
(LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
: MVT::getIntegerVT(LoadSizeInBits);
MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
LDBase->getPointerInfo(),
LDBase->getAlignment(),
MachineMemOperand::MOLoad);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
// BROADCAST - match the smallest possible repetition pattern, load that
// scalar/subvector element and then broadcast to the entire vector.
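// e.g. <x, y, x, y, x, y, x, y> repeats <x, y>, so it is enough to load the
// <x, y> pair once and broadcast it to every lane.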
if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
unsigned RepeatSize = SubElems * BaseSizeInBits;
unsigned ScalarSize = std::min(RepeatSize, 64u);
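// Scalar broadcasts handle at most 64 bits; wider repetition patterns are
// broadcast as subvectors (SUBV_BROADCAST) below.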
if (!Subtarget.hasAVX2() && ScalarSize < 32)
continue;
bool Match = true;
SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
for (unsigned i = 0; i != NumElems && Match; ++i) {
if (!LoadMask[i])
continue;
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (RepeatedLoads[i % SubElems].isUndef())
RepeatedLoads[i % SubElems] = Elt;
else
Match &= (RepeatedLoads[i % SubElems] == Elt);
}
// We must have loads at both ends of the repetition.
Match &= !RepeatedLoads.front().isUndef();
Match &= !RepeatedLoads.back().isUndef();
if (!Match)
continue;
EVT RepeatVT =
VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
: EVT::getFloatingPointVT(ScalarSize);
if (RepeatSize > ScalarSize)
RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
RepeatSize / ScalarSize);
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
VT.getSizeInBits() / ScalarSize);
if (TLI.isTypeLegal(BroadcastVT)) {
if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
: X86ISD::VBROADCAST;
SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
return DAG.getBitcast(VT, Broadcast);
}
}
}
}
return SDValue();
}
// Combine vector ops (shuffles etc.) that are equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
SmallVector<SDValue, 64> Elts;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
Elts.push_back(Elt);
continue;
}
return SDValue();
}
assert(Elts.size() == VT.getVectorNumElements());
return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
isAfterLegalize);
}
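// Rebuild a SplatBitSize-wide splat value as a constant vector of VT's
// scalar type, one element per extracted ScalarSize-bit chunk of the splat.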
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
unsigned NumElm = SplatBitSize / ScalarSize;
SmallVector<Constant *, 32> ConstantVec;
for (unsigned i = 0; i < NumElm; i++) {
APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
Constant *Const;
if (VT.isFloatingPoint()) {
if (ScalarSize == 32) {
Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
} else {
assert(ScalarSize == 64 && "Unsupported floating point scalar size");
Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
}
} else
Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
ConstantVec.push_back(Const);
}
return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
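// Returns true if node N has a use in which it can remain folded as a
// shuffle operand, in which case rewriting it as a broadcast could pessimize
// the existing shuffle lowering.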
static bool isFoldableUseOfShuffle(SDNode *N) {
for (auto *U : N->uses()) {
unsigned Opc = U->getOpcode();
// VPERMV/VPERMV3 shuffles can never fold their index operands.
if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
return false;
if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
return false;
if (isTargetShuffle(Opc))
return true;
if (Opc == ISD::BITCAST) // Ignore bitcasts
return isFoldableUseOfShuffle(U);
if (N->hasOneUse())
return true;
}
return false;
}
// Check if the current node of a build vector is a zero-extended vector.
// If so, return the value extended.
// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
// NumElt - return the number of zero-extended identical values.
// EltType - return the type of the value including the zero extend.
static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
unsigned &NumElt, MVT &EltType) {
SDValue ExtValue = Op->getOperand(0);
unsigned NumElts = Op->getNumOperands();
unsigned Delta = NumElts;
for (unsigned i = 1; i < NumElts; i++) {
if (Op->getOperand(i) == ExtValue) {
Delta = i;
break;
}
if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
return SDValue();
}
if (!isPowerOf2_32(Delta) || Delta == 1)
return SDValue();
for (unsigned i = Delta; i < NumElts; i++) {
if (i % Delta == 0) {
if (Op->getOperand(i) != ExtValue)
return SDValue();
} else if (!(isNullConstant(Op->getOperand(i)) ||
Op->getOperand(i).isUndef()))
return SDValue();
}
unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
unsigned ExtVTSize = EltSize * Delta;
EltType = MVT::getIntegerVT(ExtVTSize);
NumElt = NumElts / Delta;
return ExtValue;
}
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
// instructions, but there's less potential gain for only 128-bit vectors.
if (!Subtarget.hasAVX())
return SDValue();
MVT VT = BVOp->getSimpleValueType(0);
SDLoc dl(BVOp);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
BitVector UndefElements;
SDValue Ld = BVOp->getSplatValue(&UndefElements);
// Attempt to use VBROADCASTM
// From this pattern:
// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
// b. t1 = (build_vector t0 t0)
//
// Create (VBROADCASTM v2i1 X)
if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
MVT EltType = VT.getScalarType();
unsigned NumElts = VT.getVectorNumElements();
SDValue BOperand;
SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
(Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
if (ZeroExtended)
BOperand = ZeroExtended.getOperand(0);
else
BOperand = Ld.getOperand(0).getOperand(0);
MVT MaskVT = BOperand.getSimpleValueType();
if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
(EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
SDValue Brdcst =
DAG.getNode(X86ISD::VBROADCASTM, dl,
MVT::getVectorVT(EltType, NumElts), BOperand);
return DAG.getBitcast(VT, Brdcst);
}
}
}
unsigned NumElts = VT.getVectorNumElements();
unsigned NumUndefElts = UndefElements.count();
if (!Ld || (NumElts - NumUndefElts) <= 1) {
APInt SplatValue, Undef;
unsigned SplatBitSize;
bool HasUndef;
// Check if this is a repeated constant pattern suitable for broadcasting.
if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
SplatBitSize > VT.getScalarSizeInBits() &&
SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with a broadcast when the build vector is a foldable use
// of a shuffle instruction, to preserve the present custom lowering of
// shuffles.
if (isFoldableUseOfShuffle(BVOp))
return SDValue();
// Replace BUILD_VECTOR with a broadcast of the repeated constants.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
LLVMContext *Ctx = DAG.getContext();
MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
if (Subtarget.hasAVX()) {
if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
!(SplatBitSize == 64 && Subtarget.is32Bit())) {
// The splatted value can fit in one INTEGER constant in the constant pool.
// Load the constant and broadcast it.
MVT CVT = MVT::getIntegerVT(SplatBitSize);
Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
MVT::getVectorVT(CVT, Repeat), Ld);
return DAG.getBitcast(VT, Brdcst);
} else if (SplatBitSize == 32 || SplatBitSize == 64) {
// The splatted value can fit in one FLOAT constant in the constant pool.
// Load the constant and broadcast it.
// AVX only has support for 32- and 64-bit broadcasts of floats.
// There is no 64-bit integer broadcast on a 32-bit subtarget.
MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
// Lower the splat via APFloat directly, to avoid any conversion.
Constant *C =
SplatBitSize == 32
? ConstantFP::get(*Ctx,
APFloat(APFloat::IEEEsingle(), SplatValue))
: ConstantFP::get(*Ctx,
APFloat(APFloat::IEEEdouble(), SplatValue));
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
MVT::getVectorVT(CVT, Repeat), Ld);
return DAG.getBitcast(VT, Brdcst);
} else if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
Ld = DAG.getLoad(
MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
return DAG.getBitcast(VT, Brdcst);
}
}
}
// If we are moving a scalar into a vector (Ld must be set and all elements
// but 1 are undef) and that operation is not obviously supported by
// vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
// That's better than general shuffling and may eliminate a load to GPR and
// move from scalar to vector register.
if (!Ld || NumElts - NumUndefElts != 1)
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
return SDValue();
}
bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
bool IsGE256 = (VT.getSizeInBits() >= 256);
// When optimizing for size, generate up to 5 extra bytes for a broadcast
// instruction to save 8 or more bytes of constant pool data.
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
bool OptForSize = DAG.shouldOptForSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
// On Sandybridge (no AVX2), it is still better to load a constant vector
// from the constant pool and not to broadcast it from a scalar.
// But override that restriction when optimizing for size.
// TODO: Check if splatting is recommended for other AVX-capable CPUs.
if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
// Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
// For size optimization, also splat v2f64 and v2i64, and for size opt
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
C = CF->getConstantFPValue();
assert(C && "Invalid constant type");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CP =
DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
}
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The scalar source must be a normal load.
if (!IsLoad)
return SDValue();
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(Subtarget.hasVLX() && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit case so it doesn't
// match double, since there is no vbroadcastsd xmm instruction.
if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
// Unsupported broadcast.
return SDValue();
}
/// For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
SDValue ExtIdx) {
int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
return Idx;
// For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
// lowered this:
// (extract_vector_elt (v8f32 %1), Constant<6>)
// to:
// (extract_vector_elt (vector_shuffle<2,u,u,u>
// (extract_subvector (v8f32 %0), Constant<4>),
// undef)
// Constant<0>)
// In this case the vector is the extract_subvector expression and the index
// is 2, as specified by the shuffle.
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
SDValue ShuffleVec = SVOp->getOperand(0);
MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
assert(ShuffleVecVT.getVectorElementType() ==
ExtractedFromVec.getSimpleValueType().getVectorElementType());
int ShuffleIdx = SVOp->getMaskElt(Idx);
if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
ExtractedFromVec = ShuffleVec;
return ShuffleIdx;
}
return Idx;
}
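// Build the vector from a shuffle of (at most two) source vectors feeding the
// extract_vector_elt operands, then insert any remaining non-extract elements
// individually.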
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
// Skip if insert_vec_elt is not supported.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
return SDValue();
SDLoc DL(Op);
unsigned NumElems = Op.getNumOperands();
SDValue VecIn1;
SDValue VecIn2;
SmallVector<unsigned, 4> InsertIndices;
SmallVector<int, 8> Mask(NumElems, -1);
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Opc = Op.getOperand(i).getOpcode();
if (Opc == ISD::UNDEF)
continue;
if (Opc != ISD::EXTRACT_VECTOR_ELT) {
// Quit if more than 1 element needs inserting.
if (InsertIndices.size() > 1)
return SDValue();
InsertIndices.push_back(i);
continue;
}
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
// Quit if non-constant index.
if (!isa<ConstantSDNode>(ExtIdx))
return SDValue();
int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
// Quit if extracted from vector of different type.
if (ExtractedFromVec.getValueType() != VT)
return SDValue();
if (!VecIn1.getNode())
VecIn1 = ExtractedFromVec;
else if (VecIn1 != ExtractedFromVec) {
if (!VecIn2.getNode())
VecIn2 = ExtractedFromVec;
else if (VecIn2 != ExtractedFromVec)
// Quit if more than 2 vectors to shuffle
return SDValue();
}
if (ExtractedFromVec == VecIn1)
Mask[i] = Idx;
else if (ExtractedFromVec == VecIn2)
Mask[i] = Idx + NumElems;
}
if (!VecIn1.getNode())
return SDValue();
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
for (unsigned Idx : InsertIndices)
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
DAG.getIntPtrConstant(Idx, DL));
return NV;
}
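// Pack the constant i1 elements of a build vector into the bits of a scalar
// integer, one bit per element; undef elements contribute a zero bit.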
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
Op.getScalarValueSizeInBits() == 1 &&
"Can not convert non-constant vector");
uint64_t Immediate = 0;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (!In.isUndef())
Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
}
SDLoc dl(Op);
MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
return DAG.getConstant(Immediate, dl, VT);
}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
assert((VT.getVectorElementType() == MVT::i1) &&
"Unexpected type in LowerBUILD_VECTORvXi1!");
SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
ISD::isBuildVectorAllOnes(Op.getNode()))
return Op;
uint64_t Immediate = 0;
SmallVector<unsigned, 16> NonConstIdx;
bool IsSplat = true;
bool HasConstElts = false;
int SplatIdx = -1;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (In.isUndef())
continue;
if (!isa<ConstantSDNode>(In))
NonConstIdx.push_back(idx);
else {
Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
HasConstElts = true;
}
if (SplatIdx < 0)
SplatIdx = idx;
else if (In != Op.getOperand(SplatIdx))
IsSplat = false;
}
// For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
if (IsSplat) {
// The build_vector allows the scalar element to be larger than the vector
// element type. We need to mask it to use as a condition unless we know
// the upper bits are zero.
// FIXME: Use computeKnownBits instead of checking specific opcode?
SDValue Cond = Op.getOperand(SplatIdx);
assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
if (Cond.getOpcode() != ISD::SETCC)
Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
DAG.getConstant(1, dl, MVT::i8));
return DAG.getSelect(dl, VT, Cond,
DAG.getConstant(1, dl, VT),
DAG.getConstant(0, dl, VT));
}
// Materialize the constant elements (if any), then insert the non-constant
// elements one by one.
SDValue DstVec;
if (HasConstElts) {
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
} else {
MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
DstVec = DAG.getBitcast(VecVT, Imm);
DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
DAG.getIntPtrConstant(0, dl));
}
} else
DstVec = DAG.getUNDEF(VT);
for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
unsigned InsertIdx = NonConstIdx[i];
DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
Op.getOperand(InsertIdx),
DAG.getIntPtrConstant(InsertIdx, dl));
}
return DstVec;
}
/// This is a helper function of LowerToHorizontalOp().
/// This function checks whether the input build_vector \p N implements a
/// 128-bit partial horizontal operation on a 256-bit vector; the operation
/// may not match the layout of an x86 256-bit horizontal instruction.
/// In other words, if this returns true, then some extraction/insertion will
/// be required to produce a valid horizontal instruction.
///
/// Parameter \p Opcode defines the kind of horizontal operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
///
/// TODO: This function was originally used to match both real and fake partial
/// horizontal operations, but the index-matching logic is incorrect for that.
/// See the corrected implementation in isHopBuildVector(). Can we reduce this
/// code because it is only used for partial h-op matching now?
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
SelectionDAG &DAG,
unsigned BaseIdx, unsigned LastIdx,
SDValue &V0, SDValue &V1) {
EVT VT = N->getValueType(0);
assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
"Invalid Vector in input!");
bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
bool CanFold = true;
unsigned ExpectedVExtractIdx = BaseIdx;
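// Each result element consumes a pair of adjacent extract indices, so the
// expected extract index advances by 2 per element.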
unsigned NumElts = LastIdx - BaseIdx;
V0 = DAG.getUNDEF(VT);
V1 = DAG.getUNDEF(VT);
// Check if N implements a horizontal binop.
for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
SDValue Op = N->getOperand(i + BaseIdx);
// Skip UNDEFs.
if (Op->isUndef()) {
// Update the expected vector extract index.
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
ExpectedVExtractIdx += 2;
continue;
}
CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
if (!CanFold)
break;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0) == Op1.getOperand(0) &&
isa<ConstantSDNode>(Op0.getOperand(1)) &&
isa<ConstantSDNode>(Op1.getOperand(1)));
if (!CanFold)
break;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
if (i * 2 < NumElts) {
if (V0.isUndef()) {
V0 = Op0.getOperand(0);
if (V0.getValueType() != VT)
return false;
}
} else {
if (V1.isUndef()) {
V1 = Op0.getOperand(0);
if (V1.getValueType() != VT)
return false;
}
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
}
SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
if (I0 == ExpectedVExtractIdx)
CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
else if (IsCommutable && I1 == ExpectedVExtractIdx) {
// Try to match the following dag sequence:
// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
} else
CanFold = false;
ExpectedVExtractIdx += 2;
}
return CanFold;
}
/// Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
/// the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node would take as input
/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
/// horizontal binop dag node would take as input the lower 128-bit of V1
/// and the upper 128-bit of V1.
/// Example:
/// HADD V0_LO, V0_HI
/// HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
/// Example:
/// HADD V0_LO, V1_LO
/// HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128-bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
const SDLoc &DL, SelectionDAG &DAG,
unsigned X86Opcode, bool Mode,
bool isUndefLO, bool isUndefHI) {
MVT VT = V0.getSimpleValueType();
assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
"Invalid nodes in input!");
unsigned NumElts = VT.getVectorNumElements();
SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
MVT NewVT = V0_LO.getSimpleValueType();
SDValue LO = DAG.getUNDEF(NewVT);
SDValue HI = DAG.getUNDEF(NewVT);
if (Mode) {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
if (!isUndefLO && !V0->isUndef())
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
if (!isUndefHI && !V1->isUndef())
HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
} else {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB/SUBADD operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1,
unsigned &NumExtracts,
bool &IsSubAdd) {
MVT VT = BV->getSimpleValueType(0);
if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
return false;
unsigned NumElts = VT.getVectorNumElements();
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
NumExtracts = 0;
// Odd-numbered elements in the input build vector are obtained from
// adding/subtracting two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
// subtracting/adding two integer/float elements.
unsigned Opc[2] = {0, 0};
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::UNDEF)
continue;
// Early exit if we found an unexpected opcode.
if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
return false;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
// Early exit if we cannot match that sequence.
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
return false;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (I0 != i)
return false;
// We found a valid add/sub node; make sure it's the same opcode as previous
// elements for this parity.
if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
return false;
Opc[i % 2] = Opcode;
// Update InVec0 and InVec1.
if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
return false;
}
if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
return false;
}
// Make sure that the operands of each add/sub node always
// come from the same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
if (Opcode == ISD::FSUB)
return false;
// FADD is commutable. Try to commute the operands
// and then test again.
std::swap(Op0, Op1);
if (InVec0 != Op0.getOperand(0))
return false;
}
if (InVec1 != Op1.getOperand(0))
return false;
// Increment the number of extractions done.
++NumExtracts;
}
// Ensure we have found an opcode for both parities and that they are
// different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
// inputs are undef.
if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
InVec0.isUndef() || InVec1.isUndef())
return false;
IsSubAdd = Opc[0] == ISD::FADD;
Opnd0 = InVec0;
Opnd1 = InVec1;
return true;
}
/// Returns true if it is possible to fold MUL and an idiom that has already been
/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
/// before replacement of such SDNode with ADDSUB operation. Thus the number
/// of \p Opnd0 uses is expected to be equal to 2.
/// For example, this function may be called for the following IR:
/// %AB = fmul fast <2 x double> %A, %B
/// %Sub = fsub fast <2 x double> %AB, %C
/// %Add = fadd fast <2 x double> %AB, %C
/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
/// <2 x i32> <i32 0, i32 3>
/// There is a def for %Addsub here, which potentially can be replaced by
/// X86ISD::ADDSUB operation:
/// %Addsub = X86ISD::ADDSUB %AB, %C
/// and such ADDSUB can further be replaced with FMADDSUB:
/// %Addsub = FMADDSUB %A, %B, %C.
///
/// The main reason why this method is called before the replacement of the
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
unsigned ExpectedUses) {
if (Opnd0.getOpcode() != ISD::FMUL ||
!Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
return false;
// FIXME: These checks must match the similar ones in
// DAGCombiner::visitFADDForFMACombine. It would be good to have one
// function that would answer if it is Ok to fuse MUL + ADD to FMADD
// or MUL + ADDSUB to FMADDSUB.
const TargetOptions &Options = DAG.getTarget().Options;
bool AllowFusion =
(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
if (!AllowFusion)
return false;
Opnd2 = Opnd1;
Opnd1 = Opnd0.getOperand(1);
Opnd0 = Opnd0.getOperand(0);
return true;
}
/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Opnd0, Opnd1;
unsigned NumExtracts;
bool IsSubAdd;
if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
IsSubAdd))
return SDValue();
MVT VT = BV->getSimpleValueType(0);
SDLoc DL(BV);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
}
// We only support ADDSUB.
if (IsSubAdd)
return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
// X86 targets with 512-bit ADDSUB instructions!
// 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
// recognition.
if (VT.is512BitVector())
return SDValue();
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
unsigned &HOpcode, SDValue &V0, SDValue &V1) {
// Initialize outputs to known values.
MVT VT = BV->getSimpleValueType(0);
HOpcode = ISD::DELETED_NODE;
V0 = DAG.getUNDEF(VT);
V1 = DAG.getUNDEF(VT);
// x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
// half of the result is calculated independently from the 128-bit halves of
// the inputs, so that makes the index-checking logic below more complicated.
unsigned NumElts = VT.getVectorNumElements();
unsigned GenericOpcode = ISD::DELETED_NODE;
unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
for (unsigned i = 0; i != Num128BitChunks; ++i) {
for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
// Ignore undef elements.
SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
if (Op.isUndef())
continue;
// If there's an opcode mismatch, we're done.
if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
return false;
// Initialize horizontal opcode.
if (HOpcode == ISD::DELETED_NODE) {
GenericOpcode = Op.getOpcode();
switch (GenericOpcode) {
case ISD::ADD: HOpcode = X86ISD::HADD; break;
case ISD::SUB: HOpcode = X86ISD::HSUB; break;
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
default: return false;
}
}
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op0.getOperand(0) != Op1.getOperand(0) ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
return false;
// The source vector is chosen based on which 64-bit half of the
// destination vector is being calculated.
if (j < NumEltsIn64Bits) {
if (V0.isUndef())
V0 = Op0.getOperand(0);
} else {
if (V1.isUndef())
V1 = Op0.getOperand(0);
}
SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
if (SourceVec != Op0.getOperand(0))
return false;
// op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
unsigned ExpectedIndex = i * NumEltsIn128Bits +
(j % NumEltsIn64Bits) * 2;
if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
continue;
// If this is not a commutative op, this does not match.
if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
return false;
// Addition is commutative, so try swapping the extract indexes.
// op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
continue;
// Extract indexes do not match horizontal requirement.
return false;
}
}
// We matched. Opcode and operands are returned by reference as arguments.
return true;
}
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
SelectionDAG &DAG, unsigned HOpcode,
SDValue V0, SDValue V1) {
// If either input vector is not the same size as the build vector,
// extract/insert the low bits to the correct size.
// This is free (examples: zmm --> xmm, xmm --> ymm).
MVT VT = BV->getSimpleValueType(0);
unsigned Width = VT.getSizeInBits();
if (V0.getValueSizeInBits() > Width)
V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
else if (V0.getValueSizeInBits() < Width)
V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
if (V1.getValueSizeInBits() > Width)
V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
else if (V1.getValueSizeInBits() < Width)
V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
unsigned NumElts = VT.getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
for (unsigned i = 0; i != NumElts; ++i)
if (BV->getOperand(i).isUndef())
DemandedElts.clearBit(i);
// If we don't need the upper xmm, then perform as an xmm hop.
unsigned HalfNumElts = NumElts / 2;
if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
MVT HalfVT = VT.getHalfNumVectorElementsVT();
V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
}
return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We need at least 2 non-undef elements to make this worthwhile by default.
unsigned NumNonUndefs =
count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
if (NumNonUndefs < 2)
return SDValue();
// There are 4 sets of horizontal math operations distinguished by type:
// int/FP at 128-bit/256-bit. Each type was introduced with a different
// subtarget feature. Try to match those "native" patterns first.
MVT VT = BV->getSimpleValueType(0);
if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
unsigned HOpcode;
SDValue V0, V1;
if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
}
// Try harder to match 256-bit ops by using extract/concat.
if (!Subtarget.hasAVX() || !VT.is256BitVector())
return SDValue();
// Count the number of UNDEF operands in the input build_vector.
unsigned NumElts = VT.getVectorNumElements();
unsigned Half = NumElts / 2;
unsigned NumUndefsLO = 0;
unsigned NumUndefsHI = 0;
for (unsigned i = 0, e = Half; i != e; ++i)
if (BV->getOperand(i)->isUndef())
NumUndefsLO++;
for (unsigned i = Half, e = NumElts; i != e; ++i)
if (BV->getOperand(i)->isUndef())
NumUndefsHI++;
SDLoc DL(BV);
SDValue InVec0, InVec1;
if (VT == MVT::v8i32 || VT == MVT::v16i16) {
SDValue InVec2, InVec3;
unsigned X86Opcode;
bool CanFold = true;
if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
InVec1) &&
isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HSUB;
else
CanFold = false;
if (CanFold) {
// Do not try to expand this build_vector into a pair of horizontal
// add/sub if we can emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
return SDValue();
// Convert this build_vector into a pair of horizontal binops followed by
// a concat vector. We must adjust the outputs from the partial horizontal
// matching calls above to account for undefined vector halves.
SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
bool isUndefLO = NumUndefsLO == Half;
bool isUndefHI = NumUndefsHI == Half;
return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
isUndefHI);
}
}
if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
VT == MVT::v16i16) {
unsigned X86Opcode;
if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::HSUB;
else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::FHADD;
else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::FHSUB;
else
return SDValue();
// Don't try to expand this build_vector into a pair of horizontal add/sub
// if we can simply emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
return SDValue();
// Convert this build_vector into two horizontal add/sub followed by
// a concat vector.
bool isUndefLO = NumUndefsLO == Half;
bool isUndefHI = NumUndefsHI == Half;
return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
isUndefLO, isUndefHI);
}
return SDValue();
}
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
/// just apply the bit operation to the vectors.
/// NOTE: It's not in our interest to start making a general-purpose vectorizer
/// from this, but enough scalar bit operations are created by the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Check that all elements have the same opcode.
// TODO: Should we allow UNDEFS and if so how many?
unsigned Opcode = Op->getOperand(0).getOpcode();
for (unsigned i = 1; i < NumElems; ++i)
if (Opcode != Op->getOperand(i).getOpcode())
return SDValue();
// TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
bool IsShift = false;
switch (Opcode) {
default:
return SDValue();
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
IsShift = true;
break;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
// Don't do this if the buildvector is a splat - we'd replace one
// constant with an entire vector.
if (Op->getSplatValue())
return SDValue();
if (!TLI.isOperationLegalOrPromote(Opcode, VT))
return SDValue();
break;
}
SmallVector<SDValue, 4> LHSElts, RHSElts;
for (SDValue Elt : Op->ops()) {
SDValue LHS = Elt.getOperand(0);
SDValue RHS = Elt.getOperand(1);
// We expect the canonicalized RHS operand to be the constant.
if (!isa<ConstantSDNode>(RHS))
return SDValue();
// Extend shift amounts.
if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
if (!IsShift)
return SDValue();
RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
}
LHSElts.push_back(LHS);
RHSElts.push_back(RHS);
}
// Limit to shifts by uniform immediates.
// TODO: Only accept vXi8/vXi64 special cases?
// TODO: Permit non-uniform XOP/AVX2/MULLO cases?
if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
return SDValue();
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
// Vectors containing all zeros can be matched by pxor and xorps.
if (ISD::isBuildVectorAllZeros(Op.getNode()))
return Op;
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
return getOnesVector(VT, DAG, DL);
}
return SDValue();
}
/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT ShuffleVT = VT;
EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
unsigned NumElts = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
// Adjust IndicesVec to match VT size.
assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
"Illegal variable permute mask size");
if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
NumElts * VT.getScalarSizeInBits());
IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
// Handle a SrcVec whose size doesn't match VT.
if (SrcVec.getValueSizeInBits() != SizeInBits) {
if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
// Handle larger SrcVec by treating it as a larger permute.
unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
Subtarget, DAG, SDLoc(IndicesVec));
return extractSubVector(
createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
DAG, DL, SizeInBits);
} else if (SrcVec.getValueSizeInBits() < SizeInBits) {
// Widen smaller SrcVec to match VT.
SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
} else
return SDValue();
}
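// Helper to rescale the permute indices when the shuffle is performed at a
// narrower element width: each wide index expands into Scale consecutive
// narrow indices.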
auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
EVT SrcVT = Idx.getValueType();
unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
uint64_t IndexScale = 0;
uint64_t IndexOffset = 0;
// If we're scaling a smaller permute op, then we need to repeat the
// indices, scaling and offsetting them as well.
// e.g. v4i32 -> v16i8 (Scale = 4)
// IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
// IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
for (uint64_t i = 0; i != Scale; ++i) {
IndexScale |= Scale << (i * NumDstBits);
IndexOffset |= i << (i * NumDstBits);
}
Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
return Idx;
};
unsigned Opcode = 0;
switch (VT.SimpleTy) {
default:
break;
case MVT::v16i8:
if (Subtarget.hasSSSE3())
Opcode = X86ISD::PSHUFB;
break;
case MVT::v8i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
}
break;
case MVT::v4f32:
case MVT::v4i32:
if (Subtarget.hasAVX()) {
Opcode = X86ISD::VPERMILPV;
ShuffleVT = MVT::v4f32;
} else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
}
break;
case MVT::v2f64:
case MVT::v2i64:
if (Subtarget.hasAVX()) {
// VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
Opcode = X86ISD::VPERMILPV;
ShuffleVT = MVT::v2f64;
} else if (Subtarget.hasSSE41()) {
// SSE41 can compare v2i64 - select between indices 0 and 1.
return DAG.getSelectCC(
DL, IndicesVec,
getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
ISD::CondCode::SETEQ);
}
break;
case MVT::v32i8:
if (Subtarget.hasVLX() && Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasXOP()) {
SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
return DAG.getNode(
ISD::CONCAT_VECTORS, DL, VT,
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
} else if (Subtarget.hasAVX()) {
SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Permute Lo and Hi and then select based on index range.
// This works as PSHUFB uses bits[3:0] to permute elements and we don't
// care about bit[7] as it's just an index vector.
SDValue Idx = Ops[2];
EVT VT = Idx.getValueType();
return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
ISD::CondCode::SETGT);
};
SDValue Ops[] = {LoLo, HiHi, IndicesVec};
return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
PSHUFBBuilder);
}
break;
case MVT::v16i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
// Scale to v32i8 and perform as v32i8.
IndicesVec = ScaleIndices(IndicesVec, 2);
return DAG.getBitcast(
VT, createVariablePermute(
MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
}
break;
case MVT::v8f32:
case MVT::v8i32:
if (Subtarget.hasAVX2())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{0, 1, 2, 3, 0, 1, 2, 3});
SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{4, 5, 6, 7, 4, 5, 6, 7});
if (Subtarget.hasXOP())
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPS only uses index bits[0:1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
ISD::CondCode::SETGT);
return DAG.getBitcast(VT, Res);
}
break;
case MVT::v4i64:
case MVT::v4f64:
if (Subtarget.hasAVX512()) {
if (!Subtarget.hasVLX()) {
MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
SDLoc(SrcVec));
IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
DAG, SDLoc(IndicesVec));
SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
DAG, Subtarget);
return extract256BitVector(Res, 0, DAG, DL);
}
Opcode = X86ISD::VPERMV;
} else if (Subtarget.hasAVX()) {
SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
SDValue LoLo =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
SDValue HiHi =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
// VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
if (Subtarget.hasXOP())
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPD only uses index bit[1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
ISD::CondCode::SETGT);
return DAG.getBitcast(VT, Res);
}
break;
case MVT::v64i8:
if (Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
break;
case MVT::v32i16:
if (Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
break;
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8f64:
case MVT::v8i64:
if (Subtarget.hasAVX512())
Opcode = X86ISD::VPERMV;
break;
}
if (!Opcode)
return SDValue();
assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
(VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
"Illegal variable permute shuffle type");
uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
if (Scale > 1)
IndicesVec = ScaleIndices(IndicesVec, Scale);
EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
SDValue Res = Opcode == X86ISD::VPERMV
? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
: DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
return DAG.getBitcast(VT, Res);
}
// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can
// be reasoned about as a permutation of a vector by indices in a
// non-constant vector.
// (build_vector (extract_elt V, (extract_elt I, 0)),
// (extract_elt V, (extract_elt I, 1)),
// ...
// ->
// (vpermv I, V)
//
// TODO: Handle undefs
// TODO: Utilize pshufb and zero mask blending to support more efficient
// construction of vectors with constant-0 elements.
static SDValue
LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue SrcVec, IndicesVec;
// Check for a match of the permute source vector and permute index elements.
// This is done by checking that the i-th build_vector operand is of the form:
// (extract_elt SrcVec, (extract_elt IndicesVec, i)).
for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
SDValue Op = V.getOperand(Idx);
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// If this is the first extract encountered in V, set the source vector,
// otherwise verify the extract is from the previously defined source
// vector.
if (!SrcVec)
SrcVec = Op.getOperand(0);
else if (SrcVec != Op.getOperand(0))
return SDValue();
SDValue ExtractedIndex = Op->getOperand(1);
// Peek through extends.
if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
ExtractedIndex = ExtractedIndex.getOperand(0);
if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// If this is the first extract from the index vector candidate, set the
// indices vector, otherwise verify the extract is from the previously
// defined indices vector.
if (!IndicesVec)
IndicesVec = ExtractedIndex.getOperand(0);
else if (IndicesVec != ExtractedIndex.getOperand(0))
return SDValue();
auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
if (!PermIdx || PermIdx->getAPIntValue() != Idx)
return SDValue();
}
SDLoc DL(V);
MVT VT = V.getSimpleValueType();
return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
return VectorConstant;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
return BitOp;
unsigned EVTBits = EltVT.getSizeInBits();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (Elt.isUndef())
continue;
Values.insert(Elt);
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
NumConstants--;
}
if (X86::isZeroNode(Elt))
NumZero++;
else {
assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
NonZeros |= ((uint64_t)1 << i);
NumNonZero++;
}
}
// All-undef vector. Return an UNDEF. All-zero vectors were handled above.
if (NumNonZero == 0)
return DAG.getUNDEF(VT);
// If we are inserting one variable into a vector of non-zero constants, try
// to avoid loading each constant element as a scalar. Load the constants as a
// vector and then insert the variable scalar element. If insertion is not
// supported, fall back to a shuffle to get the scalar blended with the
// constants. Insertion into a zero vector is handled as a special-case
// somewhere below here.
if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
// Create an all-constant vector. The variable element in the old
// build vector is replaced by undef in the constant vector. Save the
// variable scalar element and its index for use in the insertelement.
LLVMContext &Context = *DAG.getContext();
Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
SDValue VarElt;
SDValue InsIndex;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (auto *C = dyn_cast<ConstantSDNode>(Elt))
ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
else if (!Elt.isUndef()) {
assert(!VarElt.getNode() && !InsIndex.getNode() &&
"Expected one variable element in this vector");
VarElt = Elt;
InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
}
}
Constant *CV = ConstantVector::get(ConstVecOps);
SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
// The constants we just created may not be legal (eg, floating point). We
// must lower the vector right here because we can not guarantee that we'll
// legalize it before loading it. This is also why we could not just create
// a new build vector here. If the build vector contains illegal constants,
// it could get split back up into a series of insert elements.
// TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
if (InsertC < NumEltsInLow128Bits)
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
// There's no good way to insert into the high elements of a >128-bit
// vector, so use shuffles to avoid an extract/insert sequence.
assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
SmallVector<int, 8> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i)
ShuffleMask.push_back(i == InsertC ? NumElts : i);
SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
}
// Special case for a single non-zero, non-undef element.
if (NumNonZero == 1) {
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.
if (Idx == 0) {
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
(EltVT == MVT::i64 && Subtarget.is64Bit())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
VT.is512BitVector()) &&
"Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
return DAG.getBitcast(VT, Item);
}
}
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
X86::isZeroNode(Op.getOperand(0)) &&
!X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
VT, Op.getOperand(1)),
NumBits/2, DAG, *this, dl);
}
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
return SDValue();
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
}
}
// Splat is obviously ok. Let legalizer expand it to a shuffle.
if (Values.size() == 1) {
if (EVTBits == 32) {
// Instead of a shuffle like this:
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr), undef, <1, 1, 1, 1>
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
}
return SDValue();
}
// A vector full of immediates; various special cases are already
// handled, so this is best done with a single constant-pool load.
if (IsAllConstants)
return SDValue();
if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
return V;
// See if we can use a vector load to get all of the elements.
{
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD =
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
// If this is a splat of pairs of 32-bit elements, we can use a narrower
// build_vector and broadcast it.
// TODO: We could probably generalize this more.
if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
// Make sure all the even/odd operands match.
for (unsigned i = 2; i != NumElems; ++i)
if (Ops[i % 2] != Op.getOperand(i))
return false;
return true;
};
if (CanSplat(Op, NumElems, Ops)) {
MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
// Create a new build vector and cast to v2i64/v2f64.
SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
DAG.getBuildVector(NarrowVT, dl, Ops));
// Broadcast from v2i64/v2f64 and cast to final VT.
MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
NewBV));
}
}
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.getSizeInBits() > 128) {
MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
// Build both the lower and upper subvector.
SDValue Lower =
DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
SDValue Upper = DAG.getBuildVector(
HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
// Recreate the wider vector with the lower and upper part.
return concatSubVectors(Lower, Upper, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
}
return SDValue();
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4)
if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
if (NumElems == 4 && NumZero > 0) {
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1ULL << i));
if (isZero)
Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros >> (i*2)) & 0x3) {
default: llvm_unreachable("Unexpected NonZero count");
case 0:
Ops[i] = Ops[i*2]; // Must be a zero vector.
break;
case 1:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
break;
case 2:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
case 3:
Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
}
}
bool Reverse1 = (NonZeros & 0x3) == 2;
bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
int MaskVec[] = {
Reverse1 ? 1 : 0,
Reverse1 ? 0 : 1,
static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
static_cast<int>(Reverse2 ? NumElems : NumElems+1)
};
return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
}
assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
// Check for a build vector from mostly shuffle plus few inserting.
if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
if (Subtarget.hasSSE41()) {
SDValue Result;
if (!Op.getOperand(0).isUndef())
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
else
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
if (Op.getOperand(i).isUndef()) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
return Result;
}
// Otherwise, expand into a number of unpckl*, start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
if (!Op.getOperand(i).isUndef())
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
else
Ops[i] = DAG.getUNDEF(VT);
}
// Next, we iteratively mix elements, e.g. for v4f32:
// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
// Generate scaled UNPCKL shuffle mask.
SmallVector<int, 16> Mask;
for(unsigned i = 0; i != Scale; ++i)
Mask.push_back(i);
for (unsigned i = 0; i != Scale; ++i)
Mask.push_back(NumElems+i);
Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
}
return Ops[0];
}
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
// TODO: Detect subvector broadcast here instead of DAG combine?
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
assert((ResVT.is256BitVector() ||
ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
unsigned NumOperands = Op.getNumOperands();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
unsigned NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
continue;
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
++NumZero;
else {
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
NonZeros |= 1 << i;
++NumNonZero;
}
}
// If we have more than 2 non-zeros, build each half separately.
if (NumNonZero > 2) {
MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(NumOperands/2));
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
// Otherwise, build it up through insert_subvectors.
SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
: DAG.getUNDEF(ResVT);
MVT SubVT = Op.getOperand(0).getSimpleValueType();
unsigned NumSubElems = SubVT.getVectorNumElements();
for (unsigned i = 0; i != NumOperands; ++i) {
if ((NonZeros & (1 << i)) == 0)
continue;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
Op.getOperand(i),
DAG.getIntPtrConstant(i * NumSubElems, dl));
}
return Vec;
}
// Lowers a vXi1 CONCAT_VECTORS by treating the operands as bit patterns in a
// mask register: zero and undef subvectors are folded away, a single non-zero
// subvector is placed with a KSHIFTL or INSERT_SUBVECTOR, and two non-zero
// subvectors are combined with KUNPCK or a pair of INSERT_SUBVECTORs.
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
unsigned NumOperands = Op.getNumOperands();
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
uint64_t Zeros = 0;
uint64_t NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
continue;
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
Zeros |= (uint64_t)1 << i;
else
NonZeros |= (uint64_t)1 << i;
}
unsigned NumElems = ResVT.getVectorNumElements();
// If we are inserting a non-zero vector and there are zeros in the LSBs and
// undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
// insert_subvector would give us two kshifts.
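// For example, concatenating (v2i1 zero, v2i1 X, v2i1 undef, v2i1 undef) into
// v8i1 gives NonZeros == 0b0010 and Zeros == 0b0001: X is inserted into a
// wider undef vector, shifted left by 2 with a single KSHIFTL, and the result
// is extracted back down to v8i1.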
if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
Log2_64(NonZeros) != NumOperands - 1) {
MVT ShiftVT = ResVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
unsigned Idx = Log2_64(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
DAG.getUNDEF(ShiftVT), SubVec,
DAG.getIntPtrConstant(0, dl));
Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
DAG.getIntPtrConstant(0, dl));
}
// If there are zero or one non-zeros we can handle this very simply.
if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
if (!NonZeros)
return Vec;
unsigned Idx = Log2_64(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
}
if (NumOperands > 2) {
MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(NumOperands/2));
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
if (ResVT.getVectorNumElements() >= 16)
return Op; // The operation is legal with KUNPCK
SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
DAG.getUNDEF(ResVT), Op.getOperand(0),
DAG.getIntPtrConstant(0, dl));
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
DAG.getIntPtrConstant(NumElems/2, dl));
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT.getVectorElementType() == MVT::i1)
return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
(VT.is512BitVector() && (Op.getNumOperands() == 2 ||
Op.getNumOperands() == 4)));
// AVX can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones.
// 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//===----------------------------------------------------------------------===//
/// Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and
/// in-place entries are 'no-ops'.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] >= 0 && Mask[i] != i)
return false;
}
return true;
}
/// Test whether there are elements crossing LaneSizeInBits lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
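///
/// For example, in a v8f32 shuffle with 128-bit lanes (four elements per
/// lane), Mask[0] == 5 pulls an element across lanes and is lane-crossing,
/// while Mask[5] == 5 stays within the upper lane.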
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
unsigned ScalarSizeInBits,
ArrayRef<int> Mask) {
assert(LaneSizeInBits && ScalarSizeInBits &&
(LaneSizeInBits % ScalarSizeInBits) == 0 &&
"Illegal shuffle lane size");
int LaneSize = LaneSizeInBits / ScalarSizeInBits;
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
return true;
return false;
}
/// Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
}
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
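///
/// For example, for v8i32 with 128-bit lanes, the mask <0,3,10,11,4,7,14,15>
/// repeats across both lanes and produces the RepeatedMask <0,3,6,7>, with
/// the second-vector entries (10,11,14,15) remapped into [4,8).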
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
if (Mask[i] < 0)
continue;
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
// Adjust second vector indices to start at LaneSize instead of Size.
int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
: Mask[i] % LaneSize + LaneSize;
if (RepeatedMask[i % LaneSize] < 0)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
SmallVector<int, 32> RepeatedMask;
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, SM_SentinelUndef);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
if (Mask[i] == SM_SentinelUndef)
continue;
if (Mask[i] == SM_SentinelZero) {
if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
return false;
RepeatedMask[i % LaneSize] = SM_SentinelZero;
continue;
}
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
// Adjust second vector indices to start at LaneSize instead of Size.
int LocalM =
Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
/// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as ExpectedMask, and each
/// element of the mask is either -1 (signifying undef) or the corresponding
/// value in ExpectedMask.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask) {
if (Mask.size() != ExpectedMask.size())
return false;
int Size = Mask.size();
// If the values are build vectors, we can look through them to find
// equivalent inputs that make the shuffles equivalent.
auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
for (int i = 0; i < Size; ++i) {
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
if (!MaskBV || !ExpectedBV ||
MaskBV->getOperand(Mask[i] % Size) !=
ExpectedBV->getOperand(ExpectedMask[i] % Size))
return false;
}
}
return true;
}
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask,
SDValue V1 = SDValue(),
SDValue V2 = SDValue()) {
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
return false;
assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
"Illegal target shuffle mask");
// Check for out-of-range target shuffle mask indices.
if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
return false;
// If the values are build vectors, we can look through them to find
// equivalent inputs that make the shuffles equivalent.
auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);
for (int i = 0; i < Size; ++i) {
if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
continue;
if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
if (MaskBV && ExpectedBV &&
MaskBV->getOperand(Mask[i] % Size) ==
ExpectedBV->getOperand(ExpectedMask[i] % Size))
continue;
}
// TODO - handle SM_Sentinel equivalences.
return false;
}
return true;
}
// Attempt to create a shuffle mask from a VSELECT condition mask.
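// For example, a v4i32 condition of <-1,0,undef,-1> yields the shuffle mask
// <0,5,6,3>: true lanes select from the first operand, while false and undef
// lanes select from the second.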
static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
SDValue Cond) {
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return false;
unsigned Size = Cond.getValueType().getVectorNumElements();
Mask.resize(Size, SM_SentinelUndef);
for (int i = 0; i != (int)Size; ++i) {
SDValue CondElt = Cond.getOperand(i);
Mask[i] = i;
// Arbitrarily choose from the 2nd operand if the select condition element
// is undef.
// TODO: Can we do better by matching patterns such as even/odd?
if (CondElt.isUndef() || isNullConstant(CondElt))
Mask[i] += Size;
}
return true;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
if (VT != MVT::v8i32 && VT != MVT::v8f32)
return false;
SmallVector<int, 8> Unpcklwd;
createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
/* Unary = */ false);
SmallVector<int, 8> Unpckhwd;
createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
/* Unary = */ false);
bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
isTargetShuffleEquivalent(Mask, Unpckhwd));
return IsUnpackwdMask;
}
static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
// Create 128-bit vector type based on mask size.
MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
MVT VT = MVT::getVectorVT(EltVT, Mask.size());
// We can't assume a canonical shuffle mask, so try the commuted version too.
SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
ShuffleVectorSDNode::commuteMask(CommutedMask);
// Match any of unary/binary or low/high.
for (unsigned i = 0; i != 4; ++i) {
SmallVector<int, 16> UnpackMask;
createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
isTargetShuffleEquivalent(CommutedMask, UnpackMask))
return true;
}
return false;
}
/// Return true if a shuffle mask chooses elements identically in its top and
/// bottom halves. For example, any splat mask has the same top and bottom
/// halves. If an element is undefined in only one half of the mask, the halves
/// are not considered identical.
static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
unsigned HalfSize = Mask.size() / 2;
for (unsigned i = 0; i != HalfSize; ++i) {
if (Mask[i] != Mask[i + HalfSize])
return false;
}
return true;
}
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
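///
/// For example, the mask <3,1,2,0> encodes as 0b00100111 (0x27), with two
/// bits per lane and lane 0 stored in the lowest two bits.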
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
unsigned Imm = 0;
Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
return Imm;
}
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
// Checks whether the shuffle result has the form:
// 0* a[0] 0* a[1] ... 0* a[n], n >= 0, i.e. runs of zeros interleaved with
// the a[] elements in ascending order. Each Zeroable element corresponds to a
// particular Mask element, as described in computeZeroableShuffleElements.
//
// The function looks for a sub-mask whose non-zero elements are in increasing
// order; if such a sub-mask exists, the function returns true.
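//
// For example, for v4i32 with elements 0 and 2 zeroable, the mask <0,4,2,5>
// qualifies: the non-zeroable entries (4,5) increase consecutively and start
// at NumElements, so IsZeroSideLeft is true and V2 is expanded into the odd
// lanes with zeros in the even lanes.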
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
ArrayRef<int> Mask, const EVT &VectorType,
bool &IsZeroSideLeft) {
int NextElement = -1;
// Check if the Mask's nonzero elements are in increasing order.
for (int i = 0, e = Mask.size(); i < e; i++) {
// Checks that the mask's zero elements are built from only zeros.
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] < 0)
return false;
if (Zeroable[i])
continue;
// Find the lowest non-zero element.
if (NextElement < 0) {
NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
IsZeroSideLeft = NextElement != 0;
}
// Exit if the mask's non-zero elements are not in increasing order.
if (NextElement != Mask[i])
return false;
NextElement++;
}
return true;
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
const int NumBytes = VT.getSizeInBits() / 8;
const int NumEltBytes = VT.getScalarSizeInBits() / 8;
assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
(Subtarget.hasAVX2() && VT.is256BitVector()) ||
(Subtarget.hasBWI() && VT.is512BitVector()));
SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
// Sign bit set in i8 mask means zero element.
SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
SDValue V;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / NumEltBytes];
if (M < 0) {
PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
continue;
}
if (Zeroable[i / NumEltBytes]) {
PSHUFBMask[i] = ZeroMask;
continue;
}
// We can only use a single input of V1 or V2.
SDValue SrcV = (M >= Size ? V2 : V1);
if (V && V != SrcV)
return SDValue();
V = SrcV;
M %= Size;
// PSHUFB can't cross lanes, ensure this doesn't happen.
if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
return SDValue();
M = M % LaneSize;
M = M * NumEltBytes + (i % NumEltBytes);
PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
}
assert(V && "Failed to find a source input");
MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
// X86 has a dedicated shuffle pattern that can be lowered to VEXPAND.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
const APInt &Zeroable,
ArrayRef<int> Mask, SDValue &V1,
SDValue &V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsLeftZeroSide = true;
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
return SDValue();
unsigned VEXPANDMask = (~Zeroable).getZExtValue();
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
unsigned NumElts = VT.getVectorNumElements();
assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
"Unexpected number of vector elements");
SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
unsigned &UnpackOpcode, bool IsUnary,
ArrayRef<int> TargetMask, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
for (int i = 0; i != NumElts; i += 2) {
int M1 = TargetMask[i + 0];
int M2 = TargetMask[i + 1];
Undef1 &= (SM_SentinelUndef == M1);
Undef2 &= (SM_SentinelUndef == M2);
Zero1 &= isUndefOrZero(M1);
Zero2 &= isUndefOrZero(M2);
}
assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
"Zeroable shuffle detected");
// Attempt to match the target mask against the unpack lo/hi mask patterns.
SmallVector<int, 64> Unpckl, Unpckh;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
return true;
}
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
return true;
}
// If a unary shuffle, attempt to match as an unpack lo/hi with zero.
if (IsUnary && (Zero1 || Zero2)) {
// Don't bother if we can blend instead.
if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
return false;
bool MatchLo = true, MatchHi = true;
for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
int M = TargetMask[i];
// Ignore if the input is known to be zero or the index is undef.
if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
(M == SM_SentinelUndef))
continue;
MatchLo &= (M == Unpckl[i]);
MatchHi &= (M == Unpckh[i]);
}
if (MatchLo || MatchHi) {
UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
return true;
}
}
// If a binary shuffle, commute and try again.
if (!IsUnary) {
ShuffleVectorSDNode::commuteMask(Unpckl);
if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
std::swap(V1, V2);
return true;
}
ShuffleVectorSDNode::commuteMask(Unpckh);
if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
std::swap(V1, V2);
return true;
}
}
return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
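// For example, for v4i32 the UNPCKL blend is <0,4,1,5> and the UNPCKH blend
// is <2,6,3,7>, interleaving the low or high halves of the two inputs.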
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1, SDValue V2,
SelectionDAG &DAG) {
SmallVector<int, 8> Unpckl;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
SmallVector<int, 8> Unpckh;
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
// Commute and try again.
ShuffleVectorSDNode::commuteMask(Unpckl);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
ShuffleVectorSDNode::commuteMask(Unpckh);
if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
return SDValue();
}
static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
int Delta) {
int Size = (int)Mask.size();
int Split = Size / Delta;
int TruncatedVectorStart = SwappedOps ? Size : 0;
// Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
return false;
// The rest of the mask should not refer to the truncated vector's elements.
if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
TruncatedVectorStart + Size))
return false;
return true;
}
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
//
// An example is the following:
//
// t0: ch = EntryToken
// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
// t25: v4i32 = truncate t2
// t41: v8i16 = bitcast t25
// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
// t18: v2i64 = bitcast t51
//
// Without avx512vl, this is lowered to:
//
// vpmovqd %zmm0, %ymm0
// vpshufb {{.*#+}} xmm0 =
// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (VT != MVT::v16i8 && VT != MVT::v8i16)
return SDValue();
if (Mask.size() != VT.getVectorNumElements())
return SDValue();
bool SwappedOps = false;
if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
if (!ISD::isBuildVectorAllZeros(V1.getNode()))
return SDValue();
std::swap(V1, V2);
SwappedOps = true;
}
// Look for:
//
// bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
// bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
//
// and similar ones.
if (V1.getOpcode() != ISD::BITCAST)
return SDValue();
if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
return SDValue();
SDValue Src = V1.getOperand(0).getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
// The vptrunc** instructions truncating 128-bit and 256-bit vectors
// are only available with avx512vl.
if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
return SDValue();
// Down-converting word to byte is only available with avx512bw. The case with
// a 256-bit output doesn't contain a shuffle and is therefore not handled here.
if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
!Subtarget.hasBWI())
return SDValue();
// The first half/quarter of the mask should refer to every second/fourth
// element of the vector truncated and bitcasted.
if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
!matchShuffleAsVPMOV(Mask, SwappedOps, 4))
return SDValue();
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
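// PACKSS and PACKUS each narrow two source vectors into one vector with
// half-width elements. For the pack to act as a plain truncation, PACKUS
// needs the upper half of every source element to be zero, and PACKSS needs
// it to be the sign-extension of the lower half; the MatchPACK lambda below
// verifies this via MaskedValueIsZero and ComputeNumSignBits.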
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
unsigned &PackOpcode, ArrayRef<int> TargetMask,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NumElts = VT.getVectorNumElements();
unsigned BitSize = VT.getScalarSizeInBits();
MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
auto MatchPACK = [&](SDValue N1, SDValue N2) {
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
(N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKUS;
return true;
}
}
if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
(N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKSS;
return true;
}
return false;
};
// Try binary shuffle.
SmallVector<int, 32> BinaryMask;
createPackShuffleMask(VT, BinaryMask, false);
if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
if (MatchPACK(V1, V2))
return true;
// Try unary shuffle.
SmallVector<int, 32> UnaryMask;
createPackShuffleMask(VT, UnaryMask, true);
if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
if (MatchPACK(V1, V1))
return true;
return false;
}
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
SDValue V1, SDValue V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
Subtarget))
return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
DAG.getBitcast(PackVT, V2));
return SDValue();
}
/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
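///
/// For example, a v4i32 shuffle <0,1,6,7> in which elements 2 and 3 are
/// zeroable lowers to an AND of V1 with the constant vector <-1,-1,0,0>.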
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT MaskVT = VT;
MVT EltVT = VT.getVectorElementType();
SDValue Zero, AllOnes;
// Use f64 if i64 isn't legal.
if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
EltVT = MVT::f64;
MaskVT = MVT::getVectorVT(EltVT, Mask.size());
}
MVT LogicVT = VT;
if (EltVT == MVT::f32 || EltVT == MVT::f64) {
Zero = DAG.getConstantFP(0.0, DL, EltVT);
AllOnes = DAG.getConstantFP(
APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
LogicVT =
MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
} else {
Zero = DAG.getConstant(0, DL, EltVT);
AllOnes = DAG.getAllOnesConstant(DL, EltVT);
}
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Zeroable[i])
continue;
if (Mask[i] % Size != i)
return SDValue(); // Not a blend.
if (!V)
V = Mask[i] < Size ? V1 : V2;
else if (V != (Mask[i] < Size ? V1 : V2))
return SDValue(); // Can only let one input through the mask.
VMaskOps[i] = AllOnes;
}
if (!V)
return SDValue(); // No non-zeroable elements!
SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
VMask = DAG.getBitcast(LogicVT, VMask);
V = DAG.getBitcast(LogicVT, V);
SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
return DAG.getBitcast(VT, And);
}
/// Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
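///
/// The result is computed as (V1 & M) | (~M & V2), where the mask M is
/// all-ones in element positions taken from V1 and all-zeros in positions
/// taken from V2.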
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
return SDValue(); // Shuffled input!
MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
}
SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG);
static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
MutableArrayRef<int> Mask,
const APInt &Zeroable, bool &ForceV1Zero,
bool &ForceV2Zero, uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZeroOrUndef =
V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
BlendMask = 0;
ForceV1Zero = false, ForceV2Zero = false;
assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
if (M == i)
continue;
if (M == i + Size) {
BlendMask |= 1ull << i;
continue;
}
if (Zeroable[i]) {
if (V1IsZeroOrUndef) {
ForceV1Zero = true;
Mask[i] = i;
continue;
}
if (V2IsZeroOrUndef) {
ForceV2Zero = true;
BlendMask |= 1ull << i;
Mask[i] = i + Size;
continue;
}
}
return false;
}
return true;
}
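// Widen a blend mask by duplicating each bit Scale times. For example,
// scaling the 4-element blend mask 0b0101 by 2 yields the 8-element mask
// 0b00110011.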
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
int Scale) {
uint64_t ScaledMask = 0;
for (int i = 0; i != Size; ++i)
if (BlendMask & (1ull << i))
ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
return ScaledMask;
}
/// Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 64> Mask(Original.begin(), Original.end());
if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
BlendMask))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
V1 = getZeroVector(VT, Subtarget, DAG, DL);
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
switch (VT.SimpleTy) {
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v4f64:
case MVT::v8f32:
assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
LLVM_FALLTHROUGH;
case MVT::v2f64:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
DAG.getTargetConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(BlendMask, DL, MVT::i8));
}
// Use PBLENDW for lower/upper lanes and then blend lanes.
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to
// merge to VSELECT where useful.
uint64_t LoMask = BlendMask & 0xFF;
uint64_t HiMask = (BlendMask >> 8) & 0xFF;
if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(LoMask, DL, MVT::i8));
SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(HiMask, DL, MVT::i8));
return DAG.getVectorShuffle(
MVT::v16i16, DL, Lo, Hi,
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
}
LLVM_FALLTHROUGH;
}
case MVT::v32i8:
assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v16i8: {
assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
// This form of blend is always done on bytes. Compute the byte vector
// type.
MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// x86 allows load folding with blendvb from the 2nd source operand. But
// we are still using LLVM select here (see comment below), so that's V1.
// If V2 can be load-folded and V1 cannot be load-folded, then commute to
// allow that load-folding possibility.
if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
}
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
// generator that boolean values in the elements of an x86 vector register
// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
// mapping a select to operand #1, and 'false' mapping to operand #2. The
// reality in x86 is that vector masks (pre-AVX-512) use only the high bit
// of the element (the remaining are ignored) and 0 in that high bit would
// mean operand #1 while 1 in the high bit would mean operand #2. So while
// the LLVM model for boolean values in vector elements gets the relevant
// bit set, it is set backwards and over constrained relative to x86's
// actual model.
SmallVector<SDValue, 32> VSELECTMask;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
VSELECTMask.push_back(
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
MVT::i8));
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
VT,
DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
V1, V2));
}
case MVT::v16f32:
case MVT::v8f64:
case MVT::v8i64:
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize) {
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
}
// Otherwise load an immediate into a GPR, cast to k-register, and use a
// masked move.
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
default:
llvm_unreachable("Not a supported integer vector type!");
}
}
/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
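///
/// For example, the v4i32 shuffle <2,1,7,4> decomposes into the blend
/// <4,1,2,7> (each output lane taken in place from one of the two inputs)
/// followed by the single-input permute <2,1,3,0>.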
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG,
bool ImmBlends = false) {
// We build up the blend mask while checking whether a blend is a viable way
// to reduce the shuffle.
SmallVector<int, 32> BlendMask(Mask.size(), -1);
SmallVector<int, 32> PermuteMask(Mask.size(), -1);
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] < 0)
continue;
assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
if (BlendMask[Mask[i] % Size] < 0)
BlendMask[Mask[i] % Size] = Mask[i];
else if (BlendMask[Mask[i] % Size] != Mask[i])
return SDValue(); // Can't blend in the needed input!
PermuteMask[i] = Mask[i] % Size;
}
// If only immediate blends, then bail if the blend mask can't be widened to
// i16.
unsigned EltSize = VT.getScalarSizeInBits();
if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
return SDValue();
SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
int NumHalfLaneElts = NumLaneElts / 2;
bool MatchLo = true, MatchHi = true;
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
// Determine UNPCKL/UNPCKH type and operand order.
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
continue;
SDValue &Op = Ops[Elt & 1];
if (M < NumElts && (Op.isUndef() || Op == V1))
Op = V1;
else if (NumElts <= M && (Op.isUndef() || Op == V2))
Op = V2;
else
return SDValue();
int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
if (!MatchLo && !MatchHi)
return SDValue();
}
}
assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
// Now check that each pair of elts comes from the same unpack pair
// and set the permute mask based on each pair.
// TODO - Investigate cases where we permute individual elements.
SmallVector<int, 32> PermuteMask(NumElts, -1);
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
int M0 = Mask[Lane + Elt + 0];
int M1 = Mask[Lane + Elt + 1];
if (0 <= M0 && 0 <= M1 &&
(M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
return SDValue();
if (0 <= M0)
PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
if (0 <= M1)
PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
}
}
unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
static SDValue lowerShuffleAsByteRotateAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
(VT.is256BitVector() && !Subtarget.hasAVX2()) ||
(VT.is512BitVector() && !Subtarget.hasBWI()))
return SDValue();
// We don't currently support lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
int Scale = VT.getScalarSizeInBits() / 8;
int NumLanes = VT.getSizeInBits() / 128;
int NumElts = VT.getVectorNumElements();
int NumEltsPerLane = NumElts / NumLanes;
// Determine range of mask elts.
bool Blend1 = true;
bool Blend2 = true;
std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
continue;
if (M < NumElts) {
Blend1 &= (M == (Lane + Elt));
assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
M = M % NumEltsPerLane;
Range1.first = std::min(Range1.first, M);
Range1.second = std::max(Range1.second, M);
} else {
M -= NumElts;
Blend2 &= (M == (Lane + Elt));
assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
M = M % NumEltsPerLane;
Range2.first = std::min(Range2.first, M);
Range2.second = std::max(Range2.second, M);
}
}
}
// Bail if we don't need both elements.
// TODO - it might be worth doing this for unary shuffles if the permute
// can be widened.
if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
!(0 <= Range2.first && Range2.second < NumEltsPerLane))
return SDValue();
if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
return SDValue();
// Rotate the 2 ops so we can access both ranges, then permute the result.
auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
SDValue Rotate = DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
DAG.getBitcast(ByteVT, Lo),
DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
continue;
if (M < NumElts)
PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
else
PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
}
}
return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
};
// Check if the ranges are small enough to rotate from either direction.
if (Range2.second < Range1.first)
return RotateAndPermute(V1, V2, Range1.first, 0);
if (Range1.second < Range2.first)
return RotateAndPermute(V2, V1, Range2.first, NumElts);
return SDValue();
}
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
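///
/// e.g. a hypothetical v4i32 mask [1, 4, 2, 7] decomposes into the V1
/// permute [1, -1, 2, -1], the V2 permute [-1, 0, -1, 3] and the blend
/// [0, 5, 2, 7].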
static SDValue lowerShuffleAsDecomposedShuffleBlend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
// Shuffle the input elements into the desired positions in V1 and V2 and
// blend them together.
SmallVector<int, 32> V1Mask(Mask.size(), -1);
SmallVector<int, 32> V2Mask(Mask.size(), -1);
SmallVector<int, 32> BlendMask(Mask.size(), -1);
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= 0 && Mask[i] < Size) {
V1Mask[i] = Mask[i];
BlendMask[i] = i;
} else if (Mask[i] >= Size) {
V2Mask[i] = Mask[i] - Size;
BlendMask[i] = i + Size;
}
// Try to lower with the simpler initial blend/unpack/rotate strategies unless
// one of the input shuffles would be a no-op. We prefer to shuffle inputs as
// the shuffle may be able to fold with a load or gain some other benefit.
// However, when we would have to do twice as many shuffles in order to
// achieve this, a 2-input pre-shuffle is the better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
// Only prefer immediate blends to unpack/rotate.
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
DAG, true))
return BlendPerm;
if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
DAG))
return UnpackPerm;
if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
DL, VT, V1, V2, Mask, Subtarget, DAG))
return RotatePerm;
// Unpack/rotate failed - try again with variable blends.
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
DAG))
return BlendPerm;
}
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
/// Try to lower a vector shuffle as a rotation.
///
/// This is used to support PALIGNR (SSSE3) and VALIGND/Q (AVX512).
static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
// [11, 12, 13, 14, 15, 0, 1, 2]
// [-1, 12, 13, 14, -1, -1, 1, -1]
// [-1, -1, -1, -1, -1, -1, 1, 2]
// [ 3, 4, 5, 6, 7, 8, 9, 10]
// [-1, 4, 5, 6, -1, -1, 9, -1]
// [-1, 4, 5, 6, -1, -1, -1, -1]
int Rotation = 0;
SDValue Lo, Hi;
for (int i = 0; i < NumElts; ++i) {
int M = Mask[i];
assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
"Unexpected mask index.");
if (M < 0)
continue;
// Determine where a rotated vector would have started.
int StartIdx = i - (M % NumElts);
if (StartIdx == 0)
// The identity rotation isn't interesting, stop.
return -1;
// If we found the tail of a vector, the rotation must be the number of
// elements missing from the front. If we found the head of a vector, the
// rotation must be the number of head elements that are visible.
int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
if (Rotation == 0)
Rotation = CandidateRotation;
else if (Rotation != CandidateRotation)
// The rotations don't match, so we can't match this mask.
return -1;
// Compute which value this mask is pointing at.
SDValue MaskV = M < NumElts ? V1 : V2;
// Compute which of the two target values this index should be assigned
// to. This reflects whether the high elements are remaining or the low
// elements are remaining.
SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
// Either set up this value if we've not encountered it before, or check
// that it remains consistent.
if (!TargetV)
TargetV = MaskV;
else if (TargetV != MaskV)
// This may be a rotation, but it pulls from the inputs in some
// unsupported interleaving.
return -1;
}
// Check that we successfully analyzed the mask, and normalize the results.
assert(Rotation != 0 && "Failed to locate a viable rotation!");
assert((Lo || Hi) && "Failed to find a rotated input vector!");
if (!Lo)
Lo = Hi;
else if (!Hi)
Hi = Lo;
V1 = Lo;
V2 = Hi;
return Rotation;
}
/// Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 targets can
/// use a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine
/// will try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return -1;
// PALIGNR works on 128-bit lanes.
SmallVector<int, 16> RepeatedMask;
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
// PALIGNR rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector lane.
int NumElts = RepeatedMask.size();
int Scale = 16 / NumElts;
return Rotation * Scale;
}
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
SDValue Lo = V1, Hi = V2;
int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
if (ByteRotation <= 0)
return SDValue();
// Cast the inputs to i8 vector of correct length to match PALIGNR or
// PSLLDQ/PSRLDQ.
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
Lo = DAG.getBitcast(ByteVT, Lo);
Hi = DAG.getBitcast(ByteVT, Hi);
// SSSE3 targets can use the palignr instruction.
if (Subtarget.hasSSSE3()) {
assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
"512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
}
assert(VT.is128BitVector() &&
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
assert(ByteVT == MVT::v16i8 &&
"SSE2 rotate lowering only needed for v16i8!");
// Default SSE2 implementation
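// e.g. for a hypothetical 5-byte rotation: (Lo << 11 bytes) | (Hi >> 5
// bytes) yields [Hi[5..15], Lo[0..4]], matching PALIGNR(Lo, Hi, 5).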
int LoByteShift = 16 - ByteRotation;
int HiByteShift = ByteRotation;
SDValue LoShift =
DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
SDValue HiShift =
DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
return DAG.getBitcast(VT,
DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
/// Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
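///
/// e.g. a hypothetical v8i32 mask [3, 4, 5, 6, 7, 8, 9, 10] takes the
/// concatenation of the two inputs shifted right by 3 elements, which
/// VALIGND can do with an immediate of 3.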
static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
"Only 32-bit and 64-bit elements are supported!");
// 128/256-bit vectors are only supported with VLX.
assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
&& "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
int Rotation = matchShuffleAsRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
DAG.getTargetConstant(Rotation, DL, MVT::i8));
}
/// Try to lower a vector shuffle as a byte shift sequence.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
assert(VT.is128BitVector() && "Only 128-bit vectors supported");
// We need a shuffle that has zeros at one/both ends and a sequential
// shuffle from one source within.
unsigned ZeroLo = Zeroable.countTrailingOnes();
unsigned ZeroHi = Zeroable.countLeadingOnes();
if (!ZeroLo && !ZeroHi)
return SDValue();
unsigned NumElts = Mask.size();
unsigned Len = NumElts - (ZeroLo + ZeroHi);
if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
return SDValue();
unsigned Scale = VT.getScalarSizeInBits() / 8;
ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
if (!isUndefOrInRange(StubMask, 0, NumElts) &&
!isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
return SDValue();
SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
Res = DAG.getBitcast(MVT::v16i8, Res);
// Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
// inner sequential set of elements, possibly offset:
// 01234567 --> zzzzzz01 --> 1zzzzzzz
// 01234567 --> 4567zzzz --> zzzzz456
// 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
if (ZeroLo == 0) {
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
} else if (ZeroHi == 0) {
unsigned Shift = Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
} else if (!Subtarget.hasSSSE3()) {
// If we don't have PSHUFB then it's worth avoiding an AND constant mask
// by performing 3 byte shifts. Shuffle combining can kick in above that.
// TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Shift += Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
} else
return SDValue();
return DAG.getBitcast(VT, Res);
}
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz, 2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [ 1, zz, 3, zz]
/// [ -1, -1, 7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz, 0, 1, 2, 3, 4, 5, 6]
/// [ zz, zz, -1, -1, 2, 3, 4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1, 1]
/// PSRLDQ : (little-endian) right byte shift
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
unsigned ScalarSizeInBits, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable,
const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
auto CheckZeros = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i < Size; i += Scale)
for (int j = 0; j < Shift; ++j)
if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
return false;
return true;
};
auto MatchShift = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i != Size; i += Scale) {
unsigned Pos = Left ? i + Shift : i;
unsigned Low = Left ? i : i + Shift;
unsigned Len = Scale - Shift;
if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
return -1;
}
int ShiftEltBits = ScalarSizeInBits * Scale;
bool ByteShift = ShiftEltBits > 64;
Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
// Normalize the scale for byte shifts to still produce an i64 element
// type.
Scale = ByteShift ? Scale / 2 : Scale;
// We need to round trip through the appropriate type for the shift.
MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
: MVT::getVectorVT(ShiftSVT, Size / Scale);
return (int)ShiftAmt;
};
// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
// keep doubling the size of the integer elements up to that. We can
// then shift the elements of the integer vector by whole multiples of
// their width within the elements of the larger integer vector. Test each
// multiple to see if we can find a match with the moved element indices
// and that the shifted in elements are all zeroable.
unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
for (int Shift = 1; Shift != Scale; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Scale, Left)) {
int ShiftAmt = MatchShift(Shift, Scale, Left);
if (0 < ShiftAmt)
return ShiftAmt;
}
// No match.
return -1;
}
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
MVT ShiftVT;
SDValue V = V1;
unsigned Opcode;
// Try to match shuffle against V1 shift.
int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
Mask, 0, Zeroable, Subtarget);
// If V1 failed, try to match shuffle against V2 shift.
if (ShiftAmt < 0) {
ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
Mask, Size, Zeroable, Subtarget);
V = V2;
}
if (ShiftAmt < 0)
return SDValue();
assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
"Illegal integer vector type");
V = DAG.getBitcast(ShiftVT, V);
V = DAG.getNode(Opcode, DL, ShiftVT, V,
DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
return DAG.getBitcast(VT, V);
}
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
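// e.g. a hypothetical v8i16 mask [1, 2, zz, zz, -1, -1, -1, -1] becomes
// EXTRQI(V1, BitLen=32, BitIdx=16).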
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
return false;
// Determine the extraction length from the part of the
// lower half that isn't zeroable.
int Len = HalfSize;
for (; Len > 0; --Len)
if (!Zeroable[Len - 1])
break;
assert(Len > 0 && "Zeroable shuffle mask");
// Attempt to match first Len sequential elements from the lower half.
SDValue Src;
int Idx = -1;
for (int i = 0; i != Len; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
SDValue &V = (M < Size ? V1 : V2);
M = M % Size;
// The extracted elements must start at a valid index and all mask
// elements must be in the lower half.
if (i > M || M >= HalfSize)
return false;
if (Idx < 0 || (Src == V && Idx == (M - i))) {
Src = V;
Idx = M - i;
continue;
}
return false;
}
if (!Src || Idx < 0)
return false;
assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Src;
return true;
}
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
return false;
for (int Idx = 0; Idx != HalfSize; ++Idx) {
SDValue Base;
// Attempt to match first source from mask before insertion point.
if (isUndefInRange(Mask, 0, Idx)) {
/* EMPTY */
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
Base = V1;
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
Base = V2;
} else {
continue;
}
// Extend the extraction length looking to match both the insertion of
// the second source and the remaining elements of the first.
for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
SDValue Insert;
int Len = Hi - Idx;
// Match insertion.
if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
Insert = V1;
} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
Insert = V2;
} else {
continue;
}
// Match the remaining elements of the lower half.
if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
/* EMPTY */
} else if ((!Base || (Base == V1)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
Base = V1;
} else if ((!Base || (Base == V2)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
Size + Hi)) {
Base = V2;
} else {
continue;
}
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Base;
V2 = Insert;
return true;
}
}
return false;
}
/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG) {
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
V2 ? V2 : DAG.getUNDEF(VT),
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return SDValue();
}
/// Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offset element index in the input; to avoid excess
/// shuffling, the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must be from
/// the same lane.
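///
/// e.g. with VT == v16i8, Scale == 2, Offset == 0 and AnyExt == false, the
/// result is equivalent to zero-extending the low 8 bytes to 16-bit
/// elements (PMOVZXBW on SSE4.1).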
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int EltBits = VT.getScalarSizeInBits();
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = 128 / EltBits;
int OffsetLane = Offset / NumEltsPerLane;
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
assert(0 <= Offset && "Extension offset must be positive.");
assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
"Extension offset must be in the first lane or start an upper lane.");
// Check that an index is in the same lane as the base offset.
auto SafeOffset = [&](int Idx) {
return OffsetLane == (Idx / NumEltsPerLane);
};
// Shift along an input so that the offset base moves to the first element.
auto ShuffleOffset = [&](SDValue V) {
if (!Offset)
return V;
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = 0; i * Scale < NumElements; ++i) {
int SrcIdx = i + Offset;
ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
}
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
};
// Found a valid a/zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2; a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,
ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
if (AnyExt && EltBits == 32) {
int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
-1};
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
if (AnyExt && EltBits == 16 && Scale > 2) {
int PSHUFDMask[4] = {Offset / 2, -1,
SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFWMask[4] = {1, -1, -1, -1};
unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
return DAG.getBitcast(
VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
}
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64 bits.
if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
assert(VT.is128BitVector() && "Unexpected vector width!");
int LoIdx = Offset * EltBits;
SDValue Lo = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getTargetConstant(EltBits, DL, MVT::i8),
DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
SDValue Hi = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getTargetConstant(EltBits, DL, MVT::i8),
DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");
SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i) {
int Idx = Offset + (i / Scale);
if ((i % Scale == 0 && SafeOffset(Idx))) {
PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
continue;
}
PSHUFBMask[i] =
AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
}
InputV = DAG.getBitcast(MVT::v16i8, InputV);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
}
// If we are extending from an offset, ensure we start on a boundary that
// we can unpack from.
int AlignToUnpack = Offset % (NumElements / Scale);
if (AlignToUnpack) {
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = AlignToUnpack; i < NumElements; ++i)
ShMask[i - AlignToUnpack] = i;
InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
Offset -= AlignToUnpack;
}
// Otherwise emit a sequence of unpacks.
do {
unsigned UnpackLoHi = X86ISD::UNPCKL;
if (Offset >= (NumElements / 2)) {
UnpackLoHi = X86ISD::UNPCKH;
Offset -= (NumElements / 2);
}
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
: getZeroVector(InputVT, Subtarget, DAG, DL);
InputV = DAG.getBitcast(InputVT, InputV);
InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
Scale /= 2;
EltBits *= 2;
NumElements /= 2;
} while (Scale > 1);
return DAG.getBitcast(VT, InputV);
}
/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
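///
/// e.g. a hypothetical v8i16 mask [8, zz, 9, zz, 10, zz, 11, zz] matches at
/// Scale == 2 as a zero-extension of the low four elements of V2.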
static SDValue lowerShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = NumElements / NumLanes;
assert(VT.getScalarSizeInBits() <= 32 &&
"Exceeds 32-bit integer zero extension limit");
assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
auto Lower = [&](int Scale) -> SDValue {
SDValue InputV;
bool AnyExt = true;
int Offset = 0;
int Matches = 0;
for (int i = 0; i < NumElements; ++i) {
int M = Mask[i];
if (M < 0)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
// Each of the extended elements need to be zeroable.
if (!Zeroable[i])
return SDValue();
// We no longer are in the anyext case.
AnyExt = false;
continue;
}
// Each of the base elements needs to be consecutive indices into the
// same input vector.
SDValue V = M < NumElements ? V1 : V2;
M = M % NumElements;
if (!InputV) {
InputV = V;
Offset = M - (i / Scale);
} else if (InputV != V)
return SDValue(); // Flip-flopping inputs.
// Offset must start in the lowest 128-bit lane or at the start of an
// upper lane.
// FIXME: Is it ever worth allowing a negative base offset?
if (!((0 <= Offset && Offset < NumEltsPerLane) ||
(Offset % NumEltsPerLane) == 0))
return SDValue();
// If we are offsetting, all referenced entries must come from the same
// lane.
if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
return SDValue();
if ((M % NumElements) != (Offset + (i / Scale)))
return SDValue(); // Non-consecutive strided elements.
Matches++;
}
// If we fail to find an input, we have a zero-shuffle which should always
// have already been handled.
// FIXME: Maybe handle this here in case during blending we end up with one?
if (!InputV)
return SDValue();
// If we are offsetting, don't extend if we only match a single input, we
// can always do better by using a basic PSHUF or PUNPCK.
if (Offset != 0 && Matches < 2)
return SDValue();
return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
assert(Bits % 64 == 0 &&
"The number of bits in a vector must be divisible by 64 on x86!");
int NumExtElements = Bits / 64;
// Each iteration, try extending the elements half as much, but into twice as
// many elements.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
"The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
}
// General extends failed, but 128-bit vectors may be able to use MOVQ.
if (Bits != 128)
return SDValue();
// Returns one of the source operands if the shuffle can be reduced to a
// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
auto CanZExtLowHalf = [&]() {
for (int i = NumElements / 2; i != NumElements; ++i)
if (!Zeroable[i])
return SDValue();
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
return V1;
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
return V2;
return SDValue();
};
if (SDValue V = CanZExtLowHalf()) {
V = DAG.getBitcast(MVT::v2i64, V);
V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
return DAG.getBitcast(VT, V);
}
// No viable ext lowering found.
return SDValue();
}
/// Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
SelectionDAG &DAG) {
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
V = peekThroughBitcasts(V);
// If the bitcasts shift the element size, we can't extract an equivalent
// element from it.
MVT NewVT = V.getSimpleValueType();
if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
// Ensure the scalar operand is the same size as the destination.
// FIXME: Add support for scalar truncation where possible.
SDValue S = V.getOperand(Idx);
if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
return DAG.getBitcast(EltVT, S);
}
return SDValue();
}
/// Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
V = peekThroughBitcasts(V);
return ISD::isNON_EXTLoad(V.getNode());
}
/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern for which we have especially efficient lowerings
/// across all subtarget feature sets.
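///
/// e.g. a hypothetical v4i32 mask [4, zz, zz, zz] becomes a single
/// VZEXT_MOVL of V2, moving its low element into place and zeroing the
/// remaining elements.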
static SDValue lowerShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
bool IsV1Zeroable = true;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (i != V2Index && !Zeroable[i]) {
IsV1Zeroable = false;
break;
}
// Check for a single input from a SCALAR_TO_VECTOR node.
// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
// all the smarts here sunk into that routine. However, the current
// lowering of BUILD_VECTOR makes that nearly impossible until the old
// vector shuffle lowering is dead.
SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
DAG);
if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
// We need to zext the scalar if it is smaller than an i32.
V2S = DAG.getBitcast(EltVT, V2S);
if (EltVT == MVT::i8 || EltVT == MVT::i16) {
// Using zext to expand a narrow element won't work for non-zero
// insertions.
if (!IsV1Zeroable)
return SDValue();
// Zero-extend directly to i32.
ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
}
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
EltVT == MVT::i16) {
// Either not inserting from the low element of the input or the input
// element size is too small to use VZEXT_MOVL to clear the high bits.
return SDValue();
}
if (!IsV1Zeroable) {
// If V1 can't be treated as a zero vector we have fewer options to lower
// this. We can't support integer vectors or non-zero targets cheaply, and
// the V1 elements can't be permuted in any way.
assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
if (!VT.isFloatingPoint() || V2Index != 0)
return SDValue();
SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
V1Mask[V2Index] = -1;
if (!isNoopShuffleMask(V1Mask))
return SDValue();
if (!VT.is128BitVector())
return SDValue();
// Otherwise, use MOVSD or MOVSS.
assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
"Only two types of floating point element types to handle!");
return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
ExtVT, V1, V2);
}
// This lowering only works for the low element with floating point vectors.
if (VT.isFloatingPoint() && V2Index != 0)
return SDValue();
V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
if (ExtVT != VT)
V2 = DAG.getBitcast(VT, V2);
if (V2Index != 0) {
// If we have 4 or fewer lanes we can cheaply shuffle the element into
// the desired position. Otherwise it is more efficient to do a vector
// shift left. We know that we can do a vector shift left because all
// the inputs are zero.
if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
DAG.getTargetConstant(
V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
}
}
return V2;
}
/// Try to lower broadcast of a single (truncated) integer element, coming
/// from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
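///
/// e.g. broadcasting element 1 of a v8i16 shuffle whose source is a v4i32
/// build_vector: take scalar 0 of \p V0, shift it right by 16 bits, truncate
/// to i16 and VBROADCAST the result.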
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
int BroadcastIdx,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
EVT EltVT = VT.getVectorElementType();
EVT V0VT = V0.getValueType();
assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
EVT V0EltVT = V0VT.getVectorElementType();
if (!V0EltVT.isInteger())
return SDValue();
const unsigned EltSize = EltVT.getSizeInBits();
const unsigned V0EltSize = V0EltVT.getSizeInBits();
// This is only a truncation if the original element type is larger.
if (V0EltSize <= EltSize)
return SDValue();
assert(((V0EltSize % EltSize) == 0) &&
"Scalar type sizes must all be powers of 2 on x86!");
const unsigned V0Opc = V0.getOpcode();
const unsigned Scale = V0EltSize / EltSize;
const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
V0Opc != ISD::BUILD_VECTOR)
return SDValue();
SDValue Scalar = V0.getOperand(V0BroadcastIdx);
// If we're extracting non-least-significant bits, shift so we can truncate.
// Hopefully, we can fold away the trunc/srl/load into the broadcast.
// Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
// vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
if (const int OffsetIdx = BroadcastIdx % Scale)
Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
/// Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the SHUFPS
/// lowering happens to be efficient.
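///
/// e.g. [1, 2, 5, 7] is a single SHUFPS (low half from one input, high half
/// from the other), while [0, 5, 2, 7] mixes inputs within a half and is not.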
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
// This routine only handles 128-bit shufps.
assert(Mask.size() == 4 && "Unsupported mask size!");
assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
// To lower with a single SHUFPS we need to have the low half and high half
// each requiring a single input.
if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
return false;
if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
return false;
return true;
}
/// If we are extracting two 128-bit halves of a vector and shuffling the
/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
/// multi-shuffle lowering.
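///
/// e.g. shuffling (extract_subvector X, 0) and (extract_subvector X, 4) of a
/// v8f32 X with the mask [0, 7, 2, 5] can become a single VPERMPS of X with
/// the mask [0, 7, 2, 5, u, u, u, u], followed by a free ymm -> xmm extract.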
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
SDValue N1, ArrayRef<int> Mask,
SelectionDAG &DAG) {
EVT VT = N0.getValueType();
assert((VT.is128BitVector() &&
(VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
"VPERM* family of shuffles requires 32-bit or 64-bit elements");
// Check that both sources are extracts of the same source vector.
if (!N0.hasOneUse() || !N1.hasOneUse() ||
N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N0.getOperand(0) != N1.getOperand(0))
return SDValue();
SDValue WideVec = N0.getOperand(0);
EVT WideVT = WideVec.getValueType();
if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
!isa<ConstantSDNode>(N1.getOperand(1)))
return SDValue();
// Match extracts of each half of the wide source vector. Commute the shuffle
// if the extract of the low half is N1.
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
ShuffleVectorSDNode::commuteMask(NewMask);
else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
return SDValue();
// Final bailout: if the mask is simple, we are better off using an extract
// and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
// because that avoids a constant load from memory.
if (NumElts == 4 &&
(isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
return SDValue();
// Extend the shuffle mask with undef elements.
NewMask.append(NumElts, -1);
// shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
NewMask);
// This is free: ymm -> xmm.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
DAG.getIntPtrConstant(0, DL));
}
/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
(Subtarget.hasAVX() && VT.isFloatingPoint()) ||
(Subtarget.hasAVX2() && VT.isInteger())))
return SDValue();
// With MOVDDUP (v2f64) we can broadcast from a register or a load; otherwise
// broadcasting from a register requires AVX2.
unsigned NumElts = Mask.size();
unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: X86ISD::VBROADCAST;
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
int BroadcastIdx = -1;
for (int i = 0; i != (int)NumElts; ++i) {
SmallVector<int, 8> BroadcastMask(NumElts, i);
if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
BroadcastIdx = i;
break;
}
}
if (BroadcastIdx < 0)
return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
case ISD::BITCAST: {
V = V.getOperand(0);
continue;
}
case ISD::CONCAT_VECTORS: {
int OpBitWidth = V.getOperand(0).getValueSizeInBits();
int OpIdx = BitOffset / OpBitWidth;
V = V.getOperand(OpIdx);
BitOffset %= OpBitWidth;
continue;
}
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
if (!ConstantIdx)
break;
int EltBitWidth = VOuter.getScalarValueSizeInBits();
int Idx = (int)ConstantIdx->getZExtValue();
int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
int BeginOffset = Idx * EltBitWidth;
int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
BitOffset -= BeginOffset;
V = VInner;
} else {
V = VOuter;
}
continue;
}
}
break;
}
assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
BroadcastIdx = BitOffset / NumEltBits;
// Do we need to bitcast the source to retrieve the original broadcast index?
bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
// If the original value has a larger element type than the shuffle, the
// broadcast element is in essence truncated. Make that explicit to ease
// folding.
if (BitCastSrc && VT.isInteger())
if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
MVT BroadcastVT = VT;
// Also check the simpler case, where we can directly reuse the scalar.
if (!BitCastSrc &&
((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
V = V.getOperand(BroadcastIdx);
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
} else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: Opcode;
}
// If we are broadcasting a load that is only used by the shuffle
// then we can reduce the vector load to the broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
EVT SVT = BroadcastVT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
DAG.makeEquivalentMemoryOrdering(Ld, V);
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
} else if (BitOffset != 0) {
// We can only broadcast from the zero-element of a vector register,
// but it can be advantageous to broadcast from the zero-element of a
// subvector.
if (!VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
// VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
if (VT == MVT::v4f64 || VT == MVT::v4i64)
return SDValue();
// Only broadcast the zero-element of a 128-bit subvector.
if ((BitOffset % 128) != 0)
return SDValue();
assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
"Unexpected bit-offset");
assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
"Unexpected vector size");
unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
V = extract128BitVector(V, ExtractIdx, DAG, DL);
}
if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
DAG.getBitcast(MVT::f64, V));
// Bitcast back to the same scalar type as BroadcastVT.
if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
"Unexpected vector element size");
MVT ExtVT;
if (V.getValueType().isVector()) {
unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
} else {
ExtVT = BroadcastVT.getScalarType();
}
V = DAG.getBitcast(ExtVT, V);
}
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
V = DAG.getBitcast(MVT::f64, V);
unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
}
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
if (V.getValueSizeInBits() > 128) {
MVT ExtVT = V.getSimpleValueType().getScalarType();
ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
V = DAG.getBitcast(ExtVT, V);
}
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// Check whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can always use two SHUFPS instructions, which are
// much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
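// e.g. a hypothetical v4f32 mask [0, 1, 6, zz] keeps V1[0..1] in place,
// inserts V2[2] into lane 2 and zeroes lane 3, giving an INSERTPS
// immediate of (2 << 6) | (2 << 4) | 0b1000 = 0xA8.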
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
unsigned &InsertPSMask,
const APInt &Zeroable,
ArrayRef<int> Mask, SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Attempt to match INSERTPS with one element from VA or VB being
// inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
// are updated.
auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
ArrayRef<int> CandidateMask) {
unsigned ZMask = 0;
int VADstIndex = -1;
int VBDstIndex = -1;
bool VAUsedInPlace = false;
for (int i = 0; i < 4; ++i) {
// Synthesize a zero mask from the zeroable elements (includes undefs).
if (Zeroable[i]) {
ZMask |= 1 << i;
continue;
}
// Flag if we use any VA inputs in place.
if (i == CandidateMask[i]) {
VAUsedInPlace = true;
continue;
}
// We can only insert a single non-zeroable element.
if (VADstIndex >= 0 || VBDstIndex >= 0)
return false;
if (CandidateMask[i] < 4) {
// VA input out of place for insertion.
VADstIndex = i;
} else {
// VB input for insertion.
VBDstIndex = i;
}
}
// Don't bother if we have no (non-zeroable) element for insertion.
if (VADstIndex < 0 && VBDstIndex < 0)
return false;
// Determine element insertion src/dst indices. The src index is from the
// start of the inserted vector, not the start of the concatenated vector.
unsigned VBSrcIndex = 0;
if (VADstIndex >= 0) {
// If we have a VA input out of place, we use VA as the V2 element
// insertion and don't use the original V2 at all.
VBSrcIndex = CandidateMask[VADstIndex];
VBDstIndex = VADstIndex;
VB = VA;
} else {
VBSrcIndex = CandidateMask[VBDstIndex] - 4;
}
// If no V1 inputs are used in place, then the result is created only from
// the zero mask and the V2 insertion - so remove V1 dependency.
if (!VAUsedInPlace)
VA = DAG.getUNDEF(MVT::v4f32);
// Update V1, V2 and InsertPSMask accordingly.
V1 = VA;
V2 = VB;
// Insert the V2 element into the desired position.
InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
return true;
};
if (matchAsInsertPS(V1, V2, Mask))
return true;
// Commute and try again.
SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
ShuffleVectorSDNode::commuteMask(CommutedMask);
if (matchAsInsertPS(V2, V1, CommutedMask))
return true;
return false;
}
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
ArrayRef<int> Mask, const APInt &Zeroable,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
// Attempt to match the insertps pattern.
unsigned InsertPSMask;
if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
return SDValue();
// Insert the V2 element into the desired position.
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
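///
/// e.g. a hypothetical v8i16 mask [0, 8, 2, 10, 4, 12, 6, 14] permutes both
/// inputs with [0, 2, 4, 6, -1, -1, -1, -1] and then feeds them to a single
/// UNPCKLWD.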
static SDValue lowerShuffleAsPermuteAndUnpack(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
"This routine only supports integer vectors.");
assert(VT.is128BitVector() &&
"This routine only works on 128-bit vectors.");
assert(!V2.isUndef() &&
"This routine should only be used when blending two inputs.");
assert(Mask.size() >= 2 && "Single element masks are invalid.");
int Size = Mask.size();
int NumLoInputs =
count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
int NumHiInputs =
count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
bool UnpackLo = NumLoInputs >= NumHiInputs;
auto TryUnpack = [&](int ScalarSize, int Scale) {
SmallVector<int, 16> V1Mask((unsigned)Size, -1);
SmallVector<int, 16> V2Mask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
// Each element of the unpack contains Scale elements from this mask.
int UnpackIdx = i / Scale;
// We only handle the case where V1 feeds the first slots of the unpack.
// We rely on canonicalization to ensure this is the case.
if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
return SDValue();
// Setup the mask for this input. The indexing is tricky as we have to
// handle the unpack stride.
SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
Mask[i] % Size;
}
// If we will have to shuffle both inputs to use the unpack, check whether
// we can just unpack first and shuffle the result. If so, skip this unpack.
if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
!isNoopShuffleMask(V2Mask))
return SDValue();
// Shuffle the inputs into place.
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
// Cast the inputs to the type we will use to unpack them.
MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
V1 = DAG.getBitcast(UnpackVT, V1);
V2 = DAG.getBitcast(UnpackVT, V2);
// Unpack the inputs and cast the result back to the desired type.
return DAG.getBitcast(
VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
UnpackVT, V1, V2));
};
// We try each unpack from the largest to the smallest to try and find one
// that fits this mask.
int OrigScalarSize = VT.getScalarSizeInBits();
for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
return Unpack;
// If we're shuffling with a zero vector then we're better off not doing
// VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
ISD::isBuildVectorAllZeros(V2.getNode()))
return SDValue();
// If none of the unpack-rooted lowerings worked (or were profitable) try an
// initial unpack.
if (NumLoInputs == 0 || NumHiInputs == 0) {
assert((NumLoInputs > 0 || NumHiInputs > 0) &&
"We have to have *some* inputs!");
int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
// FIXME: We could consider the total complexity of the permute of each
// possible unpacking. Or at the least we should consider how many
// half-crossings are created.
// FIXME: We could consider commuting the unpacks.
SmallVector<int, 32> PermMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
PermMask[i] =
2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
}
return DAG.getVectorShuffle(
VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
DL, VT, V1, V2),
DAG.getUNDEF(VT), PermMask);
}
return SDValue();
}
/// Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
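///
/// e.g. the v2f64 mask [1, 3] matches UNPCKHPD directly, while [0, 3] can be
/// lowered as a blend, as a MOVSD when V1's low element comes from a scalar,
/// or with a plain SHUFPD otherwise.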
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction.
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
return DAG.getNode(
X86ISD::SHUFP, DL, MVT::v2f64,
Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
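// E.g. Mask = {0, 3} inverts to {2, 1}: the same shuffle with the roles of
// V1 and V2 exchanged.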
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
isShuffleEquivalent(V1, V2, Mask, {1, 3}))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
return DAG.getNode(
X86ISD::MOVSD, DL, MVT::v2f64, V2,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
return V;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
/// Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
V1 = DAG.getBitcast(MVT::v4i32, V1);
int WidenedMask[4] = {
std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
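// E.g. a v2i64 mask of {1, 0} widens to the v4i32 mask {2, 3, 0, 1},
// swapping the two dword pairs with a single PSHUFD.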
return DAG.getBitcast(
MVT::v2i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
}
assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
}
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG);
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
// have this problem. It would be really nice if x86 had better shuffles here.
V1 = DAG.getBitcast(MVT::v2f64, V1);
V2 = DAG.getBitcast(MVT::v2f64, V2);
return DAG.getBitcast(MVT::v2i64,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
// Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
int V2AdjIndex = V2Index ^ 1;
if (Mask[V2AdjIndex] < 0) {
// Handles all the cases where we have a single V2 element and an undef.
// This will only ever happen in the high lanes because we commute the
// vector otherwise.
if (V2Index < 2)
std::swap(LowV, HighV);
NewMask[V2Index] -= 4;
} else {
// Handle the case where the V2 element ends up adjacent to a V1 element.
// To make this work, blend them together as the first step.
int V1Index = V2AdjIndex;
int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now proceed to reconstruct the final blend as we have the necessary
// high or low half formed.
if (V2Index < 2) {
LowV = V2;
HighV = V1;
} else {
HighV = V2;
}
NewMask[V1Index] = 2; // We put the V1 element in V2[2].
NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
}
} else if (NumV2Elements == 2) {
if (Mask[0] < 4 && Mask[1] < 4) {
// Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes.
NewMask[2] -= 4;
NewMask[3] -= 4;
} else if (Mask[2] < 4 && Mask[3] < 4) {
// We also handle the reversed case because this utility may get called
// when we detect a SHUFPS pattern but can't easily commute the shuffle to
// arrange things in the right direction.
NewMask[0] -= 4;
NewMask[1] -= 4;
HighV = V1;
LowV = V2;
} else {
// We have a mixture of V1 and V2 in both low and high lanes. Rather than
// trying to place elements directly, just blend them and set up the final
// shuffle to place them.
// The first two blend mask elements are for V1, the second two are for
// V2.
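// E.g. (illustrative) for Mask = {0, 4, 1, 5} the blend mask is
// {0, 1, 0, 1}, giving [V1[0], V1[1], V2[0], V2[1]], and the final shuffle
// below is {0, 2, 1, 3}, interleaving it into the requested order.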
int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
Mask[2] < 4 ? Mask[2] : Mask[3],
(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now we do a normal shuffle of V1 by giving V1 as both operands to
// a blend.
LowV = HighV = V1;
NewMask[0] = Mask[0] < 4 ? 0 : 2;
NewMask[1] = Mask[0] < 4 ? 2 : 0;
NewMask[2] = Mask[2] < 4 ? 1 : 3;
NewMask[3] = Mask[2] < 4 ? 3 : 1;
}
}
return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
/// Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
if (Subtarget.hasSSE3()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
// in SSE1 because otherwise they are widened to v2f64 and never get here.
if (!Subtarget.hasSSE2()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
}
// Otherwise, use a straight shuffle of a single input vector. We pass the
// input vector to both operands to simulate this with a SHUFPS.
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// There are special ways we can lower some single-element blends. However, we
// have custom ways we can lower more complex single-element blends below that
// we defer to if both this and BLENDPS fail to match, so restrict this to
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (Subtarget.hasSSE41()) {
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use INSERTPS if we can complete the shuffle efficiently.
if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
if (!isSingleSHUFPSMask(Mask))
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
V2, Mask, DAG))
return BlendPerm;
}
// Use low/high mov instructions. These are only valid in SSE1 because
// otherwise they are widened to v2f64 and never get here.
if (!Subtarget.hasSSE2()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
}
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
return V;
// Otherwise fall back to a SHUFPS lowering strategy.
return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Try to use broadcast unless the mask only has one non-undef element.
if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
}
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We coerce the shuffle pattern to be compatible with UNPCK instructions
// but we aren't actually going to use the UNPCK instruction because doing
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
Mask = UnpackLoMask;
else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
}
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (!isSingleSHUFPSMask(Mask)) {
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return Unpack;
}
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
return DAG.getBitcast(MVT::v4i32, ShufPS);
}
/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
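///
/// For example (illustrative): the mask {3, 2, 1, 0, 7, 6, 5, 4} keeps every
/// input within its own half, so it lowers to a PSHUFLW plus a PSHUFHW with
/// no cross-half dword shuffle at all.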
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputShuffle(
const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
// Attempt to directly match PSHUFLW or PSHUFHW.
if (isUndefOrInRange(LoMask, 0, 4) &&
isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
}
if (isUndefOrInRange(HiMask, 4, 8) &&
isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
for (int i = 0; i != 4; ++i)
HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
}
SmallVector<int, 4> LoInputs;
copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
int NumHToL = LoInputs.size() - NumLToL;
int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
int NumHToH = HiInputs.size() - NumLToH;
MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
// If we are shuffling values from one half - check how many different DWORD
// pairs we need to create. If only 1 or 2 then we can perform this as a
// PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
V = DAG.getNode(ShufWOp, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
V = DAG.getBitcast(PSHUFDVT, V);
V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
return DAG.getBitcast(VT, V);
};
if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
int PSHUFDMask[4] = { -1, -1, -1, -1 };
SmallVector<std::pair<int, int>, 4> DWordPairs;
int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
// Collect the different DWORD pairs.
for (int DWord = 0; DWord != 4; ++DWord) {
int M0 = Mask[2 * DWord + 0];
int M1 = Mask[2 * DWord + 1];
M0 = (M0 >= 0 ? M0 % 4 : M0);
M1 = (M1 >= 0 ? M1 % 4 : M1);
if (M0 < 0 && M1 < 0)
continue;
bool Match = false;
for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
auto &DWordPair = DWordPairs[j];
if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
(M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
PSHUFDMask[DWord] = DOffset + j;
Match = true;
break;
}
}
if (!Match) {
PSHUFDMask[DWord] = DOffset + DWordPairs.size();
DWordPairs.push_back(std::make_pair(M0, M1));
}
}
if (DWordPairs.size() <= 2) {
DWordPairs.resize(2, std::make_pair(-1, -1));
int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
DWordPairs[1].first, DWordPairs[1].second};
if ((NumHToL + NumHToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
if ((NumLToL + NumLToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
}
}
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
// to the generic code below. For example:
//
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
//
// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
// and an existing 2-into-2 on the other half. In this case we may have to
// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
// Fortunately, we don't have to handle anything but a 2-into-2 pattern
// because any other situation (including a 3-into-1 or 1-into-3 in the other
// half than the one we target for fixing) will be fixed when we re-enter this
// path. We will also combine any resulting sequence of PSHUFD instructions
// into a single instruction. Here is an example of the tricky case:
//
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
//
// This now has a 1-into-3 in the high half! Instead, we do two shuffles:
//
// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
//
// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
//
// The result is fine to be handled by the generic logic.
auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
int AOffset, int BOffset) {
assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
"Must call this with A having 3 or 1 inputs from the A half.");
assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
"Must call this with B having 1 or 3 inputs from the B half.");
assert(AToAInputs.size() + BToAInputs.size() == 4 &&
"Must call this with either 3:1 or 1:3 inputs (summing to 4).");
bool ThreeAInputs = AToAInputs.size() == 3;
// Compute the index of the dword with only one word among the three inputs
// in a half by taking the sum of the half with three inputs and subtracting
// the sum of the actual three inputs. The difference is the remaining
// slot.
int ADWord = 0, BDWord = 0;
int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
int TripleNonInputIdx =
TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
TripleDWord = TripleNonInputIdx / 2;
// We use xor with one to compute the adjacent DWord to whichever one the
// OneInput is in.
OneInputDWord = (OneInput / 2) ^ 1;
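// E.g. if OneInput is word 5, it lives in dword 2, so the adjacent dword is
// 2 ^ 1 == 3.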
// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
// and BToA inputs. If there is also such a problem with the BToB and AToB
// inputs, we don't try to fix it necessarily -- we'll recurse and see it in
// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
// is essential that we don't *create* a 3<-1 as then we might oscillate.
if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
// Compute how many inputs will be flipped by swapping these DWords. We need
// to balance this to ensure we don't form a 3-1 shuffle in the other half.
int NumFlippedAToBInputs =
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
int NumFlippedBToBInputs =
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
if ((NumFlippedAToBInputs == 1 &&
(NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
(NumFlippedBToBInputs == 1 &&
(NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
// We choose whether to fix the A half or B half based on whether that
// half has zero flipped inputs. At zero, we may not be able to fix it
// with that half. We also bias towards fixing the B half because that
// will more commonly be the high half, and we have to bias one way.
auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
ArrayRef<int> Inputs) {
int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
// Determine whether the free index is in the flipped dword or the
// unflipped dword based on where the pinned index is. We use this bit
// in an xor to conditionally select the adjacent dword.
int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
if (IsFixIdxInput == IsFixFreeIdxInput)
FixFreeIdx += 1;
IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
assert(IsFixIdxInput != IsFixFreeIdxInput &&
"We need to be changing the number of flipped inputs!");
int PSHUFHalfMask[] = {0, 1, 2, 3};
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
V = DAG.getNode(
FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
if (M >= 0 && M == FixIdx)
M = FixFreeIdx;
else if (M >= 0 && M == FixFreeIdx)
M = FixIdx;
};
if (NumFlippedBToBInputs != 0) {
int BPinnedIdx =
BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
} else {
assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
}
}
}
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
V = DAG.getBitcast(
VT,
DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// Adjust the mask to match the new locations of A and B.
for (int &M : Mask)
if (M >= 0 && M/2 == ADWord)
M = 2 * BDWord + M % 2;
else if (M >= 0 && M/2 == BDWord)
M = 2 * ADWord + M % 2;
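// E.g. with ADWord == 1 and BDWord == 2, a mask entry of 3 (word 1 of dword
// 1) is remapped to 2 * 2 + 1 == 5.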
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
// At this point there are at most two inputs to the low and high halves from
// each half. That means the inputs can always be grouped into dwords and
// those dwords can then be moved to the correct half with a dword shuffle.
// We use at most one low and one high word shuffle to collect these paired
// inputs into dwords, and finally a dword shuffle to place them.
int PSHUFLMask[4] = {-1, -1, -1, -1};
int PSHUFHMask[4] = {-1, -1, -1, -1};
int PSHUFDMask[4] = {-1, -1, -1, -1};
// First fix the masks for all the inputs that are staying in their
// original halves. This will then dictate the targets of the cross-half
// shuffles.
auto fixInPlaceInputs =
[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
MutableArrayRef<int> SourceHalfMask,
MutableArrayRef<int> HalfMask, int HalfOffset) {
if (InPlaceInputs.empty())
return;
if (InPlaceInputs.size() == 1) {
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
return;
}
if (IncomingInputs.empty()) {
// Just fix all of the in-place inputs.
for (int Input : InPlaceInputs) {
SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
PSHUFDMask[Input / 2] = Input / 2;
}
return;
}
assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
// Put the second input next to the first so that they are packed into
// a dword. We find the adjacent index by toggling the low bit.
int AdjIndex = InPlaceInputs[0] ^ 1;
SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
};
fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
// Now gather the cross-half inputs and place them into a free dword of
// their target half.
// FIXME: This operation could almost certainly be simplified dramatically to
// look more like the 3-1 fixing operation.
auto moveInputsToRightHalf = [&PSHUFDMask](
MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
int DestOffset) {
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
};
auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
int Word) {
int LowWord = Word & ~1;
int HighWord = Word | 1;
return isWordClobbered(SourceHalfMask, LowWord) ||
isWordClobbered(SourceHalfMask, HighWord);
};
if (IncomingInputs.empty())
return;
if (ExistingInputs.empty()) {
// Map any dwords with inputs from them into the right half.
for (int Input : IncomingInputs) {
// If the source half mask maps over the inputs, turn those into
// swaps and use the swapped lane.
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
Input - SourceOffset;
// We have to swap the uses in our half mask in one sweep.
for (int &M : HalfMask)
if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
M = Input;
else if (M == Input)
M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
} else {
assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
Input - SourceOffset &&
"Previous placement doesn't match!");
}
// Note that this correctly re-maps both when we do a swap and when
// we observe the other side of the swap above. We rely on that to
// avoid swapping the members of the input list directly.
Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
}
// Map the input's dword into the correct half.
if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
else
assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
Input / 2 &&
"Previous placement doesn't match!");
}
// And just directly shift any other-half mask elements to be same-half
// as we will have mirrored the dword containing the element into the
// same position within that half.
for (int &M : HalfMask)
if (M >= SourceOffset && M < SourceOffset + 4) {
M = M - SourceOffset + DestOffset;
assert(M >= 0 && "This should never wrap below zero!");
}
return;
}
// Ensure we have the input in a viable dword of its current half. This
// is particularly tricky because the original position may be clobbered
// by inputs being moved and *staying* in that half.
if (IncomingInputs.size() == 1) {
if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
SourceOffset;
SourceHalfMask[InputFixed - SourceOffset] =
IncomingInputs[0] - SourceOffset;
std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
InputFixed);
IncomingInputs[0] = InputFixed;
}
} else if (IncomingInputs.size() == 2) {
if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
// We have two non-adjacent or clobbered inputs we need to extract from
// the source half. To do this, we need to map them into some adjacent
// dword slot in the source mask.
int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
IncomingInputs[1] - SourceOffset};
// If there is a free slot in the source half mask adjacent to one of
// the inputs, place the other input in it. We use (Index XOR 1) to
// compute an adjacent index.
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
InputsFixed[1] = InputsFixed[0] ^ 1;
} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
InputsFixed[0] = InputsFixed[1] ^ 1;
} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
// The two inputs are in the same DWord but it is clobbered and the
// adjacent DWord isn't used at all. Move both inputs to the free
// slot.
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
} else {
// The only way we hit this point is if there is no clobbering
// (because there are no off-half inputs to this half) and there is no
// free slot adjacent to one of the inputs. In this case, we have to
// swap an input with a non-input.
for (int i = 0; i < 4; ++i)
assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!");
assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
"Cannot have adjacent inputs here!");
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
// We also have to update the final source mask in this case because
// it may need to undo the above swap.
for (int &M : FinalSourceHalfMask)
if (M == (InputsFixed[0] ^ 1) + SourceOffset)
M = InputsFixed[1] + SourceOffset;
else if (M == InputsFixed[1] + SourceOffset)
M = (InputsFixed[0] ^ 1) + SourceOffset;
InputsFixed[1] = InputsFixed[0] ^ 1;
}
// Point everything at the fixed inputs.
for (int &M : HalfMask)
if (M == IncomingInputs[0])
M = InputsFixed[0] + SourceOffset;
else if (M == IncomingInputs[1])
M = InputsFixed[1] + SourceOffset;
IncomingInputs[0] = InputsFixed[0] + SourceOffset;
IncomingInputs[1] = InputsFixed[1] + SourceOffset;
}
} else {
llvm_unreachable("Unhandled input size!");
}
// Now hoist the DWord down to the right half.
int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
for (int &M : HalfMask)
for (int Input : IncomingInputs)
if (M == Input)
M = FreeDWord * 2 + Input % 2;
};
moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
/*SourceOffset*/ 4, /*DestOffset*/ 0);
moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
/*SourceOffset*/ 0, /*DestOffset*/ 4);
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
V = DAG.getBitcast(
VT,
DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// At this point, each half should contain all its inputs, and we can then
// just shuffle them into their final position.
assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
"Failed to lift all the high half inputs to the low mask!");
assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
"Failed to lift all the low half inputs to the high mask!");
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
// Do a half shuffle with the high mask after shifting its values down.
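// (E.g. a HiMask of {7, 6, 5, 4} becomes {3, 2, 1, 0}, which is what
// getV4X86ShuffleImm8ForMask expects for PSHUFHW.)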
for (int &M : HiMask)
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
return V;
}
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
"Lane crossing shuffle masks not supported");
int NumBytes = VT.getSizeInBits() / 8;
int Size = Mask.size();
int Scale = NumBytes / Size;
SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
V1InUse = false;
V2InUse = false;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Scale];
if (M < 0)
continue;
const int ZeroMask = 0x80;
int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
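// E.g. (illustrative) for VT == v8i16 (Scale == 2), a mask element M == 9
// maps byte i to V2 byte (9 - 8) * 2 + i % 2 and zeroes the corresponding
// V1 byte via the 0x80 PSHUFB sentinel.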
if (Zeroable[i / Scale])
V1Idx = V2Idx = ZeroMask;
V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
V1InUse |= (ZeroMask != V1Idx);
V2InUse |= (ZeroMask != V2Idx);
}
MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
if (V1InUse)
V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
DAG.getBuildVector(ShufVT, DL, V1Mask));
if (V2InUse)
V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
DAG.getBuildVector(ShufVT, DL, V2Mask));
// If we need shuffled inputs from both, blend the two.
SDValue V;
if (V1InUse && V2InUse)
V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
else
V = V1InUse ? V1 : V2;
// Cast the result back to the correct type.
return DAG.getBitcast(VT, V);
}
/// Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
Subtarget, DAG))
return Rotate;
// Make a copy of the mask so it can be modified.
SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
Subtarget, DAG);
}
assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
"All single-input shuffles should be canonicalized to be V1-input "
"shuffles.");
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG))
return V;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue BitBlend =
lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
// Try to use byte shift instructions to mask.
if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return V;
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// can both shuffle and set up the inefficient blend.
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG, V1InUse, V2InUse);
}
// We can always bit-blend if we have to so the fallback strategy is to
// decompose into single-input permutes and blends.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG);
}
/// Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every (2^N)th element. Example shuffle
/// masks:
///
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
bool IsSingleInput) {
// The modulus for the shuffle vector entries is based on whether this is
// a single input or not.
int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
"We should only be called with masks with a power-of-2 size!");
uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
// We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
// and 2^3 simultaneously. This is because we may have ambiguity with
// partially undef inputs.
bool ViableForN[3] = {true, true, true};
for (int i = 0, e = Mask.size(); i < e; ++i) {
// Ignore undef lanes, we'll optimistically collapse them to the pattern we
// want.
if (Mask[i] < 0)
continue;
bool IsAnyViable = false;
for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
if (ViableForN[j]) {
uint64_t N = j + 1;
// The shuffle mask must be equal to (i * 2^N) % M.
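// E.g. for a 16-entry two-input mask (ModMask == 31), entry i == 3 must
// equal (3 << 1) & 31 == 6 to remain viable for N == 1.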
if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
IsAnyViable = true;
else
ViableForN[j] = false;
}
// Early exit if we exhaust the possible powers of two.
if (!IsAnyViable)
break;
}
for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
if (ViableForN[j])
return j + 1;
// Return 0 as there is no viable power of two.
return 0;
}
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
/// Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use a zext lowering.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))
return V;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
// Notably, this handles splat and partial-splat shuffles more efficiently.
// However, it only makes sense if the pre-duplication shuffle simplifies
// things significantly. Currently, this means we need to be able to
// express the pre-duplication shuffle as an i16 shuffle.
//
// FIXME: We should check for other patterns which can be widened into an
// i16 shuffle as well.
auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
for (int i = 0; i < 16; i += 2)
if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
return false;
return true;
};
auto tryToWidenViaDuplication = [&]() -> SDValue {
if (!canWidenViaDuplication(Mask))
return SDValue();
SmallVector<int, 4> LoInputs;
copy_if(Mask, std::back_inserter(LoInputs),
[](int M) { return M >= 0 && M < 8; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
bool TargetLo = LoInputs.size() >= HiInputs.size();
ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
SmallDenseMap<int, int, 8> LaneMap;
for (int I : InPlaceInputs) {
PreDupI16Shuffle[I/2] = I/2;
LaneMap[I] = I;
}
int j = TargetLo ? 0 : 4, je = j + 4;
for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
// Check if j is already a shuffle of this input. This happens when
// there are two adjacent bytes after we move the low one.
if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
// If we haven't yet mapped the input, search for a slot into which
// we can map it.
while (j < je && PreDupI16Shuffle[j] >= 0)
++j;
if (j == je)
// We can't place the inputs into a single half with a simple i16
// shuffle, so bail.
return SDValue();
// Map this input with the i16 shuffle.
PreDupI16Shuffle[j] = MovingInputs[i] / 2;
}
// Update the lane map based on the mapping we ended up with.
LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
}
V1 = DAG.getBitcast(
MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
// Unpack the bytes to form the i16s that will be shuffled into place.
bool EvenInUse = false, OddInUse = false;
for (int i = 0; i < 16; i += 2) {
EvenInUse |= (Mask[i + 0] >= 0);
OddInUse |= (Mask[i + 1] >= 0);
if (EvenInUse && OddInUse)
break;
}
V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0) {
int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
if (PostDupI16Shuffle[i / 2] < 0)
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
"Conflicting entries in the original shuffle!");
}
return DAG.getBitcast(
MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
};
if (SDValue V = tryToWidenViaDuplication())
return V;
}
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Try to use byte shift instructions to mask.
if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
// blends but after all of the single-input lowerings. If the single input
// lowerings can find an instruction sequence that is faster than a PSHUFB, we
// want to preserve that and we can DAG combine any longer sequences into
// a PSHUFB in the end. But once we start blending from multiple inputs,
// the complexity of DAG combining bad patterns back into PSHUFB is too high,
// and there are *very* few patterns that would actually be faster than the
// PSHUFB approach because of its ability to zero lanes.
//
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
if (Subtarget.hasSSSE3()) {
bool V1InUse = false;
bool V2InUse = false;
SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
// do so. This avoids using them to handle blends-with-zero which is
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// We can use an unpack to do the blending rather than an or in some
// cases. Even though the or may be marginally more efficient, we prefer
// this lowering because there are common cases where part of the
// complexity of the shuffles goes away when we do the final blend as an
// unpack.
// FIXME: It might be worth trying to detect if the unpack-feeding
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
if (SDValue V = lowerShuffleAsByteRotateAndPermute(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return V;
}
return PSHUFB;
}
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Blend;
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
// details.
//
// We special case these as they can be particularly efficiently handled with
// the PACKUSWB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
bool IsSingleInput = V2.isUndef();
if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements of the
// original input this many times to produce the result.
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8));
for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops)
ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8);
SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps);
V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
if (!IsSingleInput)
V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
// Now pack things back together.
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
for (int i = 1; i < NumEvenDrops; ++i) {
Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}
return Result;
}
// Handle multi-input cases by blending single-input shuffles.
if (NumV2Elements > 0)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
// with a pack.
SDValue V = V1;
std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0)
(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
SDValue VLoHalf, VHiHalf;
// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
// them out and avoid using UNPCK{L,H} to extract the elements of V as
// i16s.
if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
// Use a mask to drop the high bytes.
VLoHalf = DAG.getBitcast(MVT::v8i16, V);
VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
DAG.getConstant(0x00FF, DL, MVT::v8i16));
// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
VHiHalf = DAG.getUNDEF(MVT::v8i16);
// Squash the masks to point directly into VLoHalf.
for (int &M : LoBlendMask)
if (M >= 0)
M /= 2;
for (int &M : HiBlendMask)
if (M >= 0)
M /= 2;
} else {
// Otherwise just unpack the low half of V into VLoHalf and the high half into
// VHiHalf so that we can blend them as i16s.
SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
VLoHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
VHiHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
}
SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
/// Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v2f64:
return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i32:
return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4f32:
return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Unimplemented!");
}
}
/// Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
assert(V2.getSimpleValueType() == VT && "Bad operand type!");
ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
int NumElements = VT.getVectorNumElements();
int SplitNumElements = NumElements / 2;
MVT ScalarVT = VT.getVectorElementType();
MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
// Rather than splitting build-vectors, just build two narrower build
// vectors. This helps shuffling with splats and zeros.
auto SplitVector = [&](SDValue V) {
V = peekThroughBitcasts(V);
MVT OrigVT = V.getSimpleValueType();
int OrigNumElements = OrigVT.getVectorNumElements();
int OrigSplitNumElements = OrigNumElements / 2;
MVT OrigScalarVT = OrigVT.getVectorElementType();
MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
SDValue LoV, HiV;
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV) {
LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
DAG.getIntPtrConstant(0, DL));
HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
DAG.getIntPtrConstant(OrigSplitNumElements, DL));
} else {
SmallVector<SDValue, 16> LoOps, HiOps;
for (int i = 0; i < OrigSplitNumElements; ++i) {
LoOps.push_back(BV->getOperand(i));
HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
}
LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
}
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
DAG.getBitcast(SplitVT, HiV));
};
SDValue LoV1, HiV1, LoV2, HiV2;
std::tie(LoV1, HiV1) = SplitVector(V1);
std::tie(LoV2, HiV2) = SplitVector(V2);
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
if (M >= NumElements + SplitNumElements)
UseHiV2 = true;
else
UseLoV2 = true;
V2BlendMask[i] = M - NumElements;
BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
V1BlendMask[i] = M;
BlendMask[i] = i;
}
}
// Because the lowering happens after all combining takes place, we need to
// manually combine these blend masks as much as possible so that we create
// a minimal number of high-level vector shuffle nodes.
// First try just blending the halves of V1 or V2.
if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
return DAG.getUNDEF(SplitVT);
if (!UseLoV2 && !UseHiV2)
return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
if (!UseLoV1 && !UseHiV1)
return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
SDValue V1Blend, V2Blend;
if (UseLoV1 && UseHiV1) {
V1Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
} else {
// We only use half of V1 so map the usage down into the final blend mask.
V1Blend = UseLoV1 ? LoV1 : HiV1;
for (int i = 0; i < SplitNumElements; ++i)
if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
}
if (UseLoV2 && UseHiV2) {
V2Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
} else {
// We only use half of V2 so map the usage down into the final blend mask.
V2Blend = UseLoV2 ? LoV2 : HiV2;
for (int i = 0; i < SplitNumElements; ++i)
if (BlendMask[i] >= SplitNumElements)
BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
}
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
SDValue Lo = HalfBlend(LoMask);
SDValue Hi = HalfBlend(HiMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
/// Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.");
int Size = Mask.size();
// If this can be modeled as a broadcast of two elements followed by a blend,
// prefer that lowering. This is especially important because broadcasts can
// often fold with memory operands.
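// For example (illustrative), the v4 mask <0, 4, 0, 4> broadcasts element
// 0 of V1 and element 0 of V2, so it decomposes into two broadcasts plus
// a blend, and the broadcasts can fold their loads.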
auto DoBothBroadcast = [&] {
int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
for (int M : Mask)
if (M >= Size) {
if (V2BroadcastIdx < 0)
V2BroadcastIdx = M - Size;
else if (M - Size != V2BroadcastIdx)
return false;
} else if (M >= 0) {
if (V1BroadcastIdx < 0)
V1BroadcastIdx = M;
else if (M != V1BroadcastIdx)
return false;
}
return true;
};
if (DoBothBroadcast())
return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
Subtarget, DAG);
// If the inputs all stem from a single 128-bit lane of each input, then we
// split them rather than blending because the split will decompose to
// unusually few instructions.
int LaneCount = VT.getSizeInBits() / 128;
int LaneSize = Size / LaneCount;
SmallBitVector LaneInputs[2];
LaneInputs[0].resize(LaneCount, false);
LaneInputs[1].resize(LaneCount, false);
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
// Otherwise, just fall back to decomposed shuffles and a blend. This requires
// that the decomposed single-input shuffles don't end up here.
return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
DAG);
}
// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Extend to support v8f32 (+ 512-bit shuffles).
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
int LHSMask[4] = {-1, -1, -1, -1};
int RHSMask[4] = {-1, -1, -1, -1};
unsigned SHUFPMask = 0;
// As SHUFPD uses a single LHS/RHS element per lane, we can always
// perform the shuffle once the lanes have been shuffled in place.
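// For example (illustrative), Mask <2, 4, 1, 7> produces
//   LHSMask = <2, u, u, 1>, RHSMask = <4, u, u, 7>, SHUFPMask = 0xc:
// even result elements come from LHS and odd ones from RHS, with bit i of
// SHUFPMask selecting the low or high element of the lane.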
for (int i = 0; i != 4; ++i) {
int M = Mask[i];
if (M < 0)
continue;
int LaneBase = i & ~1;
auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
LaneMask[LaneBase + (M & 1)] = M;
SHUFPMask |= (M & 1) << i;
}
SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
}
/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a lane permutation followed by a per-lane permutation.
///
/// This is mainly for cases where we can have non-repeating permutes
/// in each lane.
///
/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask;
/// we should investigate merging them.
static SDValue lowerShuffleAsLanePermuteAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumEltsPerLane = NumElts / NumLanes;
SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
if (M < 0)
continue;
// Ensure that each lane comes from a single source lane.
int SrcLane = M / NumEltsPerLane;
int DstLane = i / NumEltsPerLane;
if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
return SDValue();
SrcLaneMask[DstLane] = SrcLane;
PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
}
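// For example (illustrative), a v4f64 mask of <3, 2, 1, 0> gives
// SrcLaneMask = {1, 0} and PermMask = <1, 0, 3, 2>; the lane permute built
// below is then <2, 3, 0, 1> and the final per-lane permute restores the
// requested order.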
// Make sure we set all elements of the lane mask, to avoid undef propagation.
SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
int SrcLane = SrcLaneMask[DstLane];
if (0 <= SrcLane)
for (int j = 0; j != NumEltsPerLane; ++j) {
LaneMask[(DstLane * NumEltsPerLane) + j] =
(SrcLane * NumEltsPerLane) + j;
}
}
// If we're only shuffling a single lowest lane and the rest are identity
// then don't bother.
// TODO - isShuffleMaskInputInPlace could be extended to something like this.
int NumIdentityLanes = 0;
bool OnlyShuffleLowestLane = true;
for (int i = 0; i != NumLanes; ++i) {
if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
i * NumEltsPerLane))
NumIdentityLanes++;
else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
OnlyShuffleLowestLane = false;
}
if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
return SDValue();
SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
}
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
/// source with a lane permutation.
///
/// This lowering strategy results in four instructions in the worst case for a
/// single-input cross-lane shuffle, which is fewer than any other fully general
/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
/// shuffle pattern should be handled prior to trying this lowering.
static SDValue lowerShuffleAsLanePermuteAndShuffle(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int Size = Mask.size();
int LaneSize = Size / 2;
// Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// Only do this if the elements aren't all from the lower lane,
// otherwise we're (probably) better off doing a split.
if (VT == MVT::v4f64 &&
!all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
if (SDValue V =
lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
return V;
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
if (!Subtarget.hasAVX2()) {
bool LaneCrossing[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
} else {
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneUsed[(Mask[i] % Size) / LaneSize] = true;
if (!LaneUsed[0] || !LaneUsed[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
// TODO - we could support shuffling V2 in the Flipped input.
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");
SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
for (int i = 0; i < Size; ++i) {
int &M = InLaneMask[i];
if (M < 0)
continue;
if (((M % Size) / LaneSize) != (i / LaneSize))
M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
}
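// For example (illustrative), a v4f64 mask of <2, 1, 0, 3> becomes
// InLaneMask <4, 1, 6, 3>: the cross-lane elements are redirected to the
// lane-flipped copy of V1 (operand indices 4 and up).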
assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
"In-lane shuffle mask expected");
// Flip the lanes, and shuffle the results which should now be in-lane.
MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
SDValue Flipped = DAG.getBitcast(PVT, V1);
Flipped =
DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
Flipped = DAG.getBitcast(VT, Flipped);
return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
}
/// Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
return SDValue();
bool IsLowZero = (Zeroable & 0x3) == 0x3;
bool IsHighZero = (Zeroable & 0xc) == 0xc;
// Try to use an insert into a zero vector.
if (WidenedMask[0] == 0 && IsHighZero) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL), LoV,
DAG.getIntPtrConstant(0, DL));
}
// TODO: If minimizing size and one of the inputs is a zero vector and the
// zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Blend;
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
if (!IsLowZero && !IsHighZero) {
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(2, DL));
}
}
// Try to use SHUF128 if possible.
if (Subtarget.hasVLX()) {
if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
((WidenedMask[1] % 2) << 1);
return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
}
}
// Otherwise form a 128-bit permutation. After accounting for undefs,
// convert the 64-bit shuffle mask selection values into 128-bit
// selection bits by dividing the indexes by 2 and shifting into positions
// defined by a vperm2*128 instruction's immediate control byte.
// The immediate permute control byte looks like this:
// [1:0] - select 128 bits from sources for low half of destination
// [2] - ignore
// [3] - zero low half of destination
// [5:4] - select 128 bits from sources for high half of destination
// [6] - ignore
// [7] - zero high half of destination
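// For example (illustrative), a v4f64 mask of <2, 3, 4, 5> widens to
// {1, 2} and encodes as PermMask = (1 << 0) | (2 << 4) = 0x21: the low
// half of the result is the high half of V1 and the high half is the low
// half of V2.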
assert((WidenedMask[0] >= 0 || IsLowZero) &&
(WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
unsigned PermMask = 0;
PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
// Check the immediate mask and replace unused sources with undef.
if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
V1 = DAG.getUNDEF(VT);
if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
V2 = DAG.getUNDEF(VT);
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This attempts to create a repeated lane shuffle where each lane uses one
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
if (is128BitLaneRepeatedShuffleMask(VT, Mask))
return SDValue();
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = 128 / VT.getScalarSizeInBits();
SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Srcs[2] = {-1, -1};
SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
for (int i = 0; i != NumLaneElts; ++i) {
int M = Mask[(Lane * NumLaneElts) + i];
if (M < 0)
continue;
// Determine which of the possible input lanes (NumLanes from each source)
// this element comes from. Assign that as one of the sources for this
// lane. We can assign up to 2 sources for this lane. If we run out of
// sources we can't do anything.
int LaneSrc = M / NumLaneElts;
int Src;
if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
Src = 0;
else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
Src = 1;
else
return SDValue();
Srcs[Src] = LaneSrc;
InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
}
// If this lane has two sources, see if it fits with the repeat mask so far.
if (Srcs[1] < 0)
continue;
LaneSrcs[Lane][0] = Srcs[0];
LaneSrcs[Lane][1] = Srcs[1];
auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
assert(M1.size() == M2.size() && "Unexpected mask size");
for (int i = 0, e = M1.size(); i != e; ++i)
if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
return false;
return true;
};
auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
for (int i = 0, e = MergedMask.size(); i != e; ++i) {
int M = Mask[i];
if (M < 0)
continue;
assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
"Unexpected mask element");
MergedMask[i] = M;
}
};
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
continue;
}
// Didn't find a match. Swap the operands and try again.
std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
ShuffleVectorSDNode::commuteMask(InLaneMask);
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
continue;
}
// Couldn't find a match with the operands in either order.
return SDValue();
}
// Now handle any lanes with only one source.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
// If this lane has already been processed, skip it.
if (LaneSrcs[Lane][0] >= 0)
continue;
for (int i = 0; i != NumLaneElts; ++i) {
int M = Mask[(Lane * NumLaneElts) + i];
if (M < 0)
continue;
// If RepeatMask isn't defined yet we can define it ourselves.
if (RepeatMask[i] < 0)
RepeatMask[i] = M % NumLaneElts;
if (RepeatMask[i] < NumElts) {
if (RepeatMask[i] != M % NumLaneElts)
return SDValue();
LaneSrcs[Lane][0] = M / NumLaneElts;
} else {
if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
return SDValue();
LaneSrcs[Lane][1] = M / NumLaneElts;
}
}
if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
return SDValue();
}
SmallVector<int, 16> NewMask(NumElts, -1);
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][0];
for (int i = 0; i != NumLaneElts; ++i) {
int M = -1;
if (Src >= 0)
M = Src * NumLaneElts + i;
NewMask[Lane * NumLaneElts + i] = M;
}
}
SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Ensure we didn't get back the shuffle we started with.
// FIXME: This is a hack to make up for some splat handling code in
// getVectorShuffle.
if (isa<ShuffleVectorSDNode>(NewV1) &&
cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
return SDValue();
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][1];
for (int i = 0; i != NumLaneElts; ++i) {
int M = -1;
if (Src >= 0)
M = Src * NumLaneElts + i;
NewMask[Lane * NumLaneElts + i] = M;
}
}
SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Ensure we didn't get back the shuffle we started with.
// FIXME: This is a hack to make up for some splat handling code in
// getVectorShuffle.
if (isa<ShuffleVectorSDNode>(NewV2) &&
cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
return SDValue();
for (int i = 0; i != NumElts; ++i) {
NewMask[i] = RepeatMask[i % NumLaneElts];
if (NewMask[i] < 0)
continue;
NewMask[i] += (i / NumLaneElts) * NumLaneElts;
}
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
/// If the input shuffle mask results in a vector that is undefined in all upper
/// or lower half elements and that mask accesses only 2 halves of the
/// shuffle's operands, return true. A mask of half the width with mask indexes
/// adjusted to access the extracted halves of the original shuffle operands is
/// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half of each
/// input operand is accessed (0 = lower V1, 1 = upper V1, 2 = lower V2,
/// 3 = upper V2).
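/// For example (illustrative), the v8 mask <u, u, u, u, 0, 1, 12, 13> has an
/// undef lower half; it reads the lower half of V1 and the upper half of V2,
/// giving HalfIdx1 = 0, HalfIdx2 = 3 and HalfMask = <0, 1, 4, 5>.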
static bool
getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
int &HalfIdx1, int &HalfIdx2) {
assert((Mask.size() == HalfMask.size() * 2) &&
"Expected input mask to be twice as long as output");
// Exactly one half of the result must be undef to allow narrowing.
bool UndefLower = isUndefLowerHalf(Mask);
bool UndefUpper = isUndefUpperHalf(Mask);
if (UndefLower == UndefUpper)
return false;
unsigned HalfNumElts = HalfMask.size();
unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
HalfIdx1 = -1;
HalfIdx2 = -1;
for (unsigned i = 0; i != HalfNumElts; ++i) {
int M = Mask[i + MaskIndexOffset];
if (M < 0) {
HalfMask[i] = M;
continue;
}
// Determine which of the 4 half vectors this element is from.
// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
int HalfIdx = M / HalfNumElts;
// Determine the element index into its half vector source.
int HalfElt = M % HalfNumElts;
// We can shuffle with up to 2 half vectors, set the new 'half'
// shuffle mask accordingly.
if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
HalfMask[i] = HalfElt;
HalfIdx1 = HalfIdx;
continue;
}
if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
HalfMask[i] = HalfElt + HalfNumElts;
HalfIdx2 = HalfIdx;
continue;
}
// Too many half vectors referenced.
return false;
}
return true;
}
/// Given the output values from getHalfShuffleMask(), create a half width
/// shuffle of extracted vectors followed by an insert back to full width.
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
ArrayRef<int> HalfMask, int HalfIdx1,
int HalfIdx2, bool UndefLower,
SelectionDAG &DAG, bool UseConcat = false) {
assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
assert(V1.getValueType().isSimple() && "Expecting only simple types");
MVT VT = V1.getSimpleValueType();
MVT HalfVT = VT.getHalfNumVectorElementsVT();
unsigned HalfNumElts = HalfVT.getVectorNumElements();
auto getHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
SDValue V = (HalfIdx < 2 ? V1 : V2);
HalfIdx = (HalfIdx % 2) * HalfNumElts;
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
DAG.getIntPtrConstant(HalfIdx, DL));
};
// ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
SDValue Half1 = getHalfVector(HalfIdx1);
SDValue Half2 = getHalfVector(HalfIdx2);
SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
if (UseConcat) {
SDValue Op0 = V;
SDValue Op1 = DAG.getUNDEF(HalfVT);
if (UndefLower)
std::swap(Op0, Op1);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
}
unsigned Offset = UndefLower ? HalfNumElts : 0;
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
DAG.getIntPtrConstant(Offset, DL));
}
/// Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.is256BitVector() || VT.is512BitVector()) &&
"Expected 256-bit or 512-bit vector");
bool UndefLower = isUndefLowerHalf(Mask);
if (!UndefLower && !isUndefUpperHalf(Mask))
return SDValue();
assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
"Completely undef shuffle mask should have been simplified already");
// Upper half is undef and lower half is whole upper subvector.
// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
MVT HalfVT = VT.getHalfNumVectorElementsVT();
unsigned HalfNumElts = HalfVT.getVectorNumElements();
if (!UndefLower &&
isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getIntPtrConstant(HalfNumElts, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(0, DL));
}
// Lower half is undef and upper half is whole lower subvector.
// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (UndefLower &&
isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(HalfNumElts, DL));
}
int HalfIdx1, HalfIdx2;
SmallVector<int, 8> HalfMask(HalfNumElts);
if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
return SDValue();
assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
// Only shuffle the halves of the inputs when useful.
unsigned NumLowerHalves =
(HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
unsigned NumUpperHalves =
(HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
// Determine the larger pattern of undef/halves, then decide if it's worth
// splitting the shuffle based on subtarget capabilities and types.
unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
if (!UndefLower) {
// XXXXuuuu: no insert is needed.
// Always extract lowers when setting lower - these are all free subreg ops.
if (NumUpperHalves == 0)
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
if (NumUpperHalves == 1) {
// AVX2 has efficient 32/64-bit element cross-lane shuffles.
if (Subtarget.hasAVX2()) {
// extract128 + vunpckhps/vshufps is better than vblend + vpermps.
if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
!is128BitUnpackShuffleMask(HalfMask) &&
(!isSingleSHUFPSMask(HalfMask) ||
Subtarget.hasFastVariableShuffle()))
return SDValue();
// If this is a unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
// are better off extracting the upper half of 1 operand and using a
// narrow shuffle.
if (EltWidth == 64 && V2.isUndef())
return SDValue();
}
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
return SDValue();
// Extract + narrow shuffle is better than the wide alternative.
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
}
// Don't extract both uppers, instead shuffle and then extract.
assert(NumUpperHalves == 2 && "Half vector count went wrong");
return SDValue();
}
// UndefLower - uuuuXXXX: an insert to high half is required if we split this.
if (NumUpperHalves == 0) {
// AVX2 has efficient 64-bit element cross-lane shuffles.
// TODO: Refine to account for unary shuffle, splat, and other masks?
if (Subtarget.hasAVX2() && EltWidth == 64)
return SDValue();
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
return SDValue();
// Narrow shuffle + insert is better than the wide alternative.
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
}
// NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
return SDValue();
}
/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
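/// For example (illustrative), the v4 mask <0, 5, 2, 7> leaves both inputs in
/// place: elements 0 and 2 of V1 and elements 1 and 3 of V2 (mask values 5
/// and 7) already occupy their destination slots.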
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
return false;
return true;
}
/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
// On AVX2 we may be able to just shuffle the lowest elements and then
// broadcast the result.
if (Subtarget.hasAVX2()) {
for (unsigned BroadcastSize : {16, 32, 64}) {
if (BroadcastSize <= VT.getScalarSizeInBits())
continue;
int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
// Attempt to match a repeating pattern every NumBroadcastElts,
// accounting for UNDEFs, but only referencing the lowest 128-bit
// lane of the inputs.
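// For example (illustrative), a v8i32 mask of <0, 1, 0, 1, 0, 1, 0, 1>
// matches with BroadcastSize == 64: we shuffle <0, 1> into place and then
// broadcast the low 64 bits with BroadcastMask <0, 1, 0, 1, 0, 1, 0, 1>.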
auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j) {
int M = Mask[i + j];
if (M < 0)
continue;
int &R = RepeatMask[j];
if (0 != ((M % NumElts) / NumLaneElts))
return false;
if (0 <= R && R != M)
return false;
R = M;
}
return true;
};
SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
if (!FindRepeatingBroadcastMask(RepeatMask))
continue;
// Shuffle the (lowest) repeated elements in place for broadcast.
SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
// Shuffle the actual broadcast.
SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j)
BroadcastMask[i + j] = j;
return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
BroadcastMask);
}
}
// Bail if the shuffle mask doesn't cross 128-bit lanes.
if (!is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
// Bail if we already have a repeated lane shuffle mask.
SmallVector<int, 8> RepeatedShuffleMask;
if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
return SDValue();
// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
int NumSubLanes = NumLanes * SubLaneScale;
int NumSubLaneElts = NumLaneElts / SubLaneScale;
// Check that all the sources are coming from the same lane and see if we can
// form a repeating shuffle mask (local to each sub-lane). At the same time,
// determine the source sub-lane for each destination sub-lane.
int TopSrcSubLane = -1;
SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
// Extract the sub-lane mask, check that it all comes from the same lane
// and normalize the mask entries to come from the first lane.
int SrcLane = -1;
SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
if (M < 0)
continue;
int Lane = (M % NumElts) / NumLaneElts;
if ((0 <= SrcLane) && (SrcLane != Lane))
return SDValue();
SrcLane = Lane;
int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
SubLaneMask[Elt] = LocalM;
}
// Whole sub-lane is UNDEF.
if (SrcLane < 0)
continue;
// Attempt to match against the candidate repeated sub-lane masks.
for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
for (int i = 0; i != NumSubLaneElts; ++i) {
if (M1[i] < 0 || M2[i] < 0)
continue;
if (M1[i] != M2[i])
return false;
}
return true;
};
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
continue;
// Merge the sub-lane mask into the matching repeated sub-lane mask.
for (int i = 0; i != NumSubLaneElts; ++i) {
int M = SubLaneMask[i];
if (M < 0)
continue;
assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
"Unexpected mask element");
RepeatedSubLaneMask[i] = M;
}
// Track the topmost source sub-lane - by setting the remaining to UNDEF
// we can greatly simplify shuffle matching.
int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
break;
}
// Bail if we failed to find a matching repeated sub-lane mask.
if (Dst2SrcSubLanes[DstSubLane] < 0)
return SDValue();
}
assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
"Unexpected source lane");
// Create a repeating shuffle mask for the entire vector.
SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
int Lane = SubLane / SubLaneScale;
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = RepeatedSubLaneMask[Elt];
if (M < 0)
continue;
int Idx = (SubLane * NumSubLaneElts) + Elt;
RepeatedMask[Idx] = M + (Lane * NumLaneElts);
}
}
SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
// Shuffle each source sub-lane to its destination.
SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumSubLaneElts) {
int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
if (SrcSubLane < 0)
continue;
for (int j = 0; j != NumSubLaneElts; ++j)
SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
}
return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
SubLaneMask);
}
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
bool &ForceV1Zero, bool &ForceV2Zero,
unsigned &ShuffleImm, ArrayRef<int> Mask,
const APInt &Zeroable) {
int NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
"Illegal shuffle mask");
bool ZeroLane[2] = { true, true };
for (int i = 0; i < NumElts; ++i)
ZeroLane[i & 1] &= Zeroable[i];
// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
// Mask for V4F64: 0/1, 4/5, 2/3, 6/7
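// For example (illustrative), the v4f64 mask <0, 5, 2, 7> fits the first
// pattern directly (ShufpdMask) and yields ShuffleImm = 0b1010, where bit
// i selects the low or high element of the 128-bit lane feeding result
// element i.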
ShuffleImm = 0;
bool ShufpdMask = true;
bool CommutableMask = true;
for (int i = 0; i < NumElts; ++i) {
if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
continue;
if (Mask[i] < 0)
return false;
int Val = (i & 6) + NumElts * (i & 1);
int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
if (Mask[i] < Val || Mask[i] > Val + 1)
ShufpdMask = false;
if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
CommutableMask = false;
ShuffleImm |= (Mask[i] % 2) << i;
}
if (!ShufpdMask && !CommutableMask)
return false;
if (!ShufpdMask && CommutableMask)
std::swap(V1, V2);
ForceV1Zero = ZeroLane[0];
ForceV2Zero = ZeroLane[1];
return true;
}
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
"Unexpected data type for VSHUFPD");
unsigned Immediate = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
Mask, Zeroable))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
V1 = getZeroVector(VT, Subtarget, DAG, DL);
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
DAG.getTargetConstant(Immediate, DL, MVT::i8));
}
// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
// by zeroable elements in the remaining 24 elements. Turn this into two
// vpmovqb instructions shuffled together.
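// For example (illustrative), each VTRUNC leaves its four truncated bytes
// in bytes 0-3 and zeroes bytes 4-15, so the dword interleave below yields
// the 8 wanted bytes followed by 8 zeroes, and inserting into a zero v32i8
// supplies the remaining 16 zero bytes.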
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
const APInt &Zeroable,
SelectionDAG &DAG) {
assert(VT == MVT::v32i8 && "Unexpected type!");
// The first 8 indices should be every 8th element.
if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
return SDValue();
// Remaining elements need to be zeroable.
if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
return SDValue();
V1 = DAG.getBitcast(MVT::v4i64, V1);
V2 = DAG.getBitcast(MVT::v4i64, V2);
V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
// The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
// the upper bits of the result using an unpckldq.
SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
{ 0, 1, 2, 3, 16, 17, 18, 19,
4, 5, 6, 7, 20, 21, 22, 23 });
// Insert the unpckldq into a zero vector to widen to v32i8.
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
DAG.getConstant(0, DL, MVT::v32i8), Unpack,
DAG.getIntPtrConstant(0, DL));
}
/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
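// For example (illustrative), Mask <1, 0, 3, 2> encodes as
// VPERMILPMask = 0b0101: bit i selects the high (1) or low (0) element of
// its 128-bit lane for result element i, swapping within each lane.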
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
}
// With AVX2 we have direct support for this permutation.
if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
Mask, DAG, Subtarget))
return V;
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
DAG, Subtarget);
}
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Op;
// If we have lane crossing shuffles AND they don't all come from the lower
// lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
// canonicalizes to a blend of splats, which isn't necessary for this combine.
if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
!all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
(V1.getOpcode() != ISD::BUILD_VECTOR) &&
(V2.getOpcode() != ISD::BUILD_VECTOR))
if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
Mask, DAG))
return Op;
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either input is already in place,
// we will be able to shuffle the other input, even across lanes, in a
// single instruction, so skip this pattern.
if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the vector, we
// can use lower latency instructions that will operate on both lanes.
SmallVector<int, 2> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
SmallVector<int, 4> PSHUFDMask;
scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
DAG.getBitcast(MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
// AVX2 provides a direct instruction for permuting a single input across
// lanes.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
}
// Try to use PALIGNR.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either input is already in place,
// we will be able to shuffle the other input, even across lanes, in a
// single instruction, so skip this pattern.
if (!isShuffleMaskInputInPlace(0, Mask) &&
!isShuffleMaskInputInPlace(1, Mask))
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends.
return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
}
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have a single input shuffle with different shuffle patterns in the
// two 128-bit lanes use the variable mask to VPERMILPS.
if (V2.isUndef()) {
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
DAG, Subtarget);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// For non-AVX512 targets, if the mask matches an in-lane unpack-word
// pattern, try to split, since after the split we get more efficient code
// using vpunpcklwd and vpunpckhwd than with vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG))
return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// For non-AVX512 targets, if the mask matches an in-lane unpack-word
// pattern, try to split, since after the split we get more efficient code
// than vblend by using vpunpcklwd and vpunpckhwd.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
!Subtarget.hasAVX512())
if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the two 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
bool Is128BitLaneRepeatedShuffle =
is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
}
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return V;
// If the shuffle patterns aren't repeated but it is a single input, directly
// generate a cross-lane VPERMD instruction.
if (V2.isUndef()) {
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
}
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v8i32, ShufPS);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return V;
if (V2.isUndef()) {
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
DAG, Subtarget);
}
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v16 case.
return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
}
}
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512BWVL can lower to VPERMW.
if (Subtarget.hasBWI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return V;
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
DAG, Subtarget);
}
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512VBMIVL can lower to VPERMB.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
// by zeroable elements in the remaining 24 elements. Turn this into two
// vmovqb instructions shuffled together.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
Mask, Zeroable, DAG))
return V;
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG);
}
/// High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
SDValue V1, SDValue V2, const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = VT.getVectorNumElements();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// There is a really nice hard cut-over between AVX1 and AVX2 that means we
// can check for those subtargets here and avoid much of the subtarget
// querying in the per-vector-type lowering routines. With AVX1 we have
// essentially *zero* ability to manipulate a 256-bit vector with integer
// types. Since we'll use floating point types there eventually, just
// immediately cast everything to a float and operate entirely in that domain.
if (VT.isInteger() && !Subtarget.hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();
if (ElementBits < 32) {
// No floating point type available, if we can't use the bit operations
// for masking/blending then decompose into 128-bit vectors.
if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
VT.getVectorNumElements());
V1 = DAG.getBitcast(FpVT, V1);
V2 = DAG.getBitcast(FpVT, V2);
return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
}
switch (VT.SimpleTy) {
case MVT::v4f64:
return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i64:
return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8f32:
return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i32:
return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i16:
return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i8:
return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
}
}
/// Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
// Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
// most probably the better solution for that case.
assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
// TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
// Try to use an insert into a zero vector.
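// Note: Zeroable has one bit per original 64-bit element here, so 0xf0
// covers the upper 256 bits of the result and 0x0c covers bits 128-255.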
if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
(WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL), LoV,
DAG.getIntPtrConstant(0, DL));
}
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
{0, 1, 2, 3, 0, 1, 2, 3});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
{0, 1, 2, 3, 8, 9, 10, 11})) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(4, DL));
}
assert(WidenedMask.size() == 4);
// See if this is an insertion of the lower 128-bits of V2 into V1.
bool IsInsert = true;
int V2Index = -1;
for (int i = 0; i < 4; ++i) {
assert(WidenedMask[i] >= -1);
if (WidenedMask[i] < 0)
continue;
// Make sure all V1 subvectors are in place.
if (WidenedMask[i] < 4) {
if (WidenedMask[i] != i) {
IsInsert = false;
break;
}
} else {
// Make sure we only have a single V2 index and it's the lowest 128 bits.
if (V2Index >= 0 || WidenedMask[i] != 4) {
IsInsert = false;
break;
}
V2Index = i;
}
}
if (IsInsert && V2Index >= 0) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
DAG.getIntPtrConstant(0, DL));
return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
}
// Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
unsigned PermMask = 0;
// Ensure elements came from the same Op.
for (int i = 0; i < 4; ++i) {
assert(WidenedMask[i] >= -1);
if (WidenedMask[i] < 0)
continue;
SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
else if (Ops[OpIndex] != Op)
return SDValue();
// Convert the 128-bit shuffle mask selection values into 128-bit selection
// bits defined by a vshuf64x2 instruction's immediate control byte.
PermMask |= (WidenedMask[i] % 4) << (i * 2);
}
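// For example, a widened mask {0, 2, 5, 7} takes 128-bit lanes 0 and 2 from
// V1 and lanes 1 and 3 from V2, giving PermMask = 0b11011000 (0xD8).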
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
/// Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (V2.isUndef()) {
// Use low duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
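// E.g. Mask {1, 0, 2, 3, 4, 5, 7, 7} sets bits 0, 3, 5, 6 and 7, giving
// VPERMILPMask = 0xE9.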
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
}
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
}
if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
V2, Subtarget, DAG))
return Shuf128;
if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Op;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
// Use even/odd duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Otherwise, fall back to a SHUFPS sequence.
return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
// If we have a single-input shuffle with different shuffle patterns in the
// 128-bit lanes that doesn't cross lanes, use a variable-mask VPERMILPS.
if (V2.isUndef() &&
!is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
}
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes, we can use
// lower-latency instructions that operate on all four 128-bit lanes.
SmallVector<int, 2> Repeated128Mask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
SmallVector<int, 4> PSHUFDMask;
scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
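// E.g. a repeated 128-bit mask {1, 0} scales to the v16i32 PSHUFD mask
// {2, 3, 0, 1}.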
return DAG.getBitcast(
MVT::v8i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
DAG.getBitcast(MVT::v16i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
SmallVector<int, 4> Repeated256Mask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
}
if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
V2, Subtarget, DAG))
return Shuf128;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to use PALIGNR.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the four 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
bool Is128BitLaneRepeatedShuffle =
is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to use byte rotation instructions.
if (Subtarget.hasBWI())
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Assume that a single SHUFPS is faster than using a permv shuffle.
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (V2.isUndef()) {
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.
return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
}
}
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())
return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (!V2.isUndef())
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// FIXME: Implement direct support for this type!
return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = Mask.size();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i16:
return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v64i8:
return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 512-bit x86 vector type!");
}
}
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// Shuffle should be unary.
if (!V2.isUndef())
return SDValue();
int ShiftAmt = -1;
int NumElts = Mask.size();
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
"Unexpected mask index.");
if (M < 0)
continue;
// The first non-undef element determines our shift amount.
if (ShiftAmt < 0) {
ShiftAmt = M - i;
// Need to be shifting right.
if (ShiftAmt <= 0)
return SDValue();
}
// All non-undef elements must shift by the same amount.
if (ShiftAmt != M - i)
return SDValue();
}
assert(ShiftAmt >= 0 && "All undef?");
// Great, we found a right shift.
MVT WideVT = VT;
if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
DAG.getUNDEF(WideVT), V1,
DAG.getIntPtrConstant(0, DL));
Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
// Determine if this shuffle can be implemented with a KSHIFT instruction.
// Returns the shift amount if possible or -1 if not. This is a simplified
// version of matchShuffleAsShift.
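// For example, with Size == 8, Mask == {2,3,4,5,6,7,-1,-1} and the top two
// elements zeroable, this returns 2 with Opcode == X86ISD::KSHIFTR.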
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable) {
int Size = Mask.size();
auto CheckZeros = [&](int Shift, bool Left) {
for (int j = 0; j < Shift; ++j)
if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
return false;
return true;
};
auto MatchShift = [&](int Shift, bool Left) {
unsigned Pos = Left ? Shift : 0;
unsigned Low = Left ? 0 : Shift;
unsigned Len = Size - Shift;
return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
};
for (int Shift = 1; Shift != Size; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
return Shift;
}
return -1;
}
// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
int NumElts = Mask.size();
// Try to recognize shuffles that are just padding a subvector with zeros.
int SubvecElts = 0;
int Src = -1;
for (int i = 0; i != NumElts; ++i) {
if (Mask[i] >= 0) {
// Grab the source from the first valid mask element. All subsequent
// elements need to use this same source.
if (Src < 0)
Src = Mask[i] / NumElts;
if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
break;
}
++SubvecElts;
}
assert(SubvecElts != NumElts && "Identity shuffle?");
// Clip to a power of 2.
SubvecElts = PowerOf2Floor(SubvecElts);
// Make sure the number of zeroable bits at the top at least covers the
// elements not covered by the subvector.
if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
assert(Src >= 0 && "Expected a source!");
MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
Src == 0 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
DAG.getConstant(0, DL, VT),
Extract, DAG.getIntPtrConstant(0, DL));
}
// Try a simple shift right with undef elements. Later we'll try with zeros.
if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
DAG))
return Shift;
// Try to match KSHIFTs.
unsigned Offset = 0;
for (SDValue V : { V1, V2 }) {
unsigned Opcode;
int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
if (ShiftAmt >= 0) {
MVT WideVT = VT;
if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
DAG.getUNDEF(WideVT), V,
DAG.getIntPtrConstant(0, DL));
// Widened right shifts need two shifts to ensure we shift in zeroes.
if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
int WideElts = WideVT.getVectorNumElements();
// Shift left to put the original vector in the MSBs of the new size.
Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
// Increase the shift amount to account for the left shift.
ShiftAmt += WideElts - NumElts;
}
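// E.g. a v4i1 right shift by 1 widened to v16i1 becomes a KSHIFTL by 12
// followed by a KSHIFTR by 13.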
Res = DAG.getNode(Opcode, DL, WideVT, Res,
DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
Offset += NumElts; // Increment for next iteration.
}
MVT ExtVT;
switch (VT.SimpleTy) {
default:
llvm_unreachable("Expected a vector of i1 elements");
case MVT::v2i1:
ExtVT = MVT::v2i64;
break;
case MVT::v4i1:
ExtVT = MVT::v4i32;
break;
case MVT::v8i1:
// Take a 512-bit type: there are more shuffles available on KNL at that
// width. If we have VLX, use a 256-bit shuffle.
ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
break;
case MVT::v16i1:
// Take 512-bit type, unless we are avoiding 512-bit types and have the
// 256-bit operation available.
ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
break;
case MVT::v32i1:
// Take 512-bit type, unless we are avoiding 512-bit types and have the
// 256-bit operation available.
assert(Subtarget.hasBWI() && "Expected AVX512BW support");
ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
break;
case MVT::v64i1:
// Fall back to scalarization. FIXME: We can do better if the shuffle
// can be partitioned cleanly.
if (!Subtarget.useBWIRegs())
return SDValue();
ExtVT = MVT::v64i8;
break;
}
V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
// As i1 was sign-extended we can use X86ISD::CVT2MASK.
int NumElems = VT.getVectorNumElements();
if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
(Subtarget.hasDQI() && (NumElems < 32)))
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
Shuffle, ISD::SETGT);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
/// Helper function that returns true if the shuffle mask should be
/// commuted to improve canonicalization.
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
int NumElements = Mask.size();
int NumV1Elements = 0, NumV2Elements = 0;
for (int M : Mask)
if (M < 0)
continue;
else if (M < NumElements)
++NumV1Elements;
else
++NumV2Elements;
// Commute the shuffle as needed such that more elements come from V1 than
// V2. This allows us to match the shuffle pattern strictly on how many
// elements come from V1 without handling the symmetric cases.
if (NumV2Elements > NumV1Elements)
return true;
assert(NumV1Elements > 0 && "No V1 indices");
if (NumV2Elements == 0)
return false;
// When the number of V1 and V2 elements are the same, try to minimize the
// number of uses of V2 in the low half of the vector. When that is tied,
// ensure that the sum of indices for V1 is equal to or lower than the sum of
// indices for V2. When those are equal, try to ensure that the number of odd
// indices for V1 is lower than the number of odd indices for V2.
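// For example, {8, 1, 10, 3} is commuted: the V1/V2 and low-half counts tie,
// but the sum of V2 indices (0 + 2) is lower than the sum of V1 indices
// (1 + 3).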
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
for (int M : Mask.slice(0, NumElements / 2))
if (M >= NumElements)
++LowV2Elements;
else if (M >= 0)
++LowV1Elements;
if (LowV2Elements > LowV1Elements)
return true;
if (LowV2Elements == LowV1Elements) {
int SumV1Indices = 0, SumV2Indices = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= NumElements)
SumV2Indices += i;
else if (Mask[i] >= 0)
SumV1Indices += i;
if (SumV2Indices < SumV1Indices)
return true;
if (SumV2Indices == SumV1Indices) {
int NumV1OddIndices = 0, NumV2OddIndices = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= NumElements)
NumV2OddIndices += i % 2;
else if (Mask[i] >= 0)
NumV1OddIndices += i % 2;
if (NumV2OddIndices < NumV1OddIndices)
return true;
}
}
}
return false;
}
/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> OrigMask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getSimpleValueType();
int NumElements = VT.getVectorNumElements();
SDLoc DL(Op);
bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
"Can't lower MMX shuffles");
bool V1IsUndef = V1.isUndef();
bool V2IsUndef = V2.isUndef();
if (V1IsUndef && V2IsUndef)
return DAG.getUNDEF(VT);
// When we create a shuffle node we put the UNDEF node in the second operand,
// but in some cases the first operand may be transformed to UNDEF.
// In that case we should just commute the node.
if (V1IsUndef)
return DAG.getCommutedVectorShuffle(*SVOp);
// Check for non-undef masks pointing at an undef vector and make the masks
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
if (V2IsUndef &&
any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
for (int &M : NewMask)
if (M >= NumElements)
M = -1;
return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
}
// Check for illegal shuffle mask element index values.
int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
(void)MaskUpperLimit;
assert(llvm::all_of(OrigMask,
[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index");
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
APInt KnownUndef, KnownZero;
computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
APInt Zeroable = KnownUndef | KnownZero;
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
// Shuffle mask widening should not interfere with a broadcast opportunity
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
Subtarget, DAG))
return Broadcast;
MVT NewEltVT = VT.isFloatingPoint()
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
int NewNumElts = NumElements / 2;
MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
// Make sure that the new vector type is legal. For example, v2f64 isn't
// legal on SSE1.
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
if (V2IsZero) {
// Modify the new Mask to take all zeros from the all-zero vector.
// Choose indices that are blend-friendly.
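// E.g. a widened mask {0, SM_SentinelZero} becomes {0, 3}, taking element 1
// from the zero vector.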
bool UsedZeroVector = false;
assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
"V2's non-undef elements are used?!");
for (int i = 0; i != NewNumElts; ++i)
if (WidenedMask[i] == SM_SentinelZero) {
WidenedMask[i] = i + NewNumElts;
UsedZeroVector = true;
}
// Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
// some elements to be undef.
if (UsedZeroVector)
V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
}
V1 = DAG.getBitcast(NewVT, V1);
V2 = DAG.getBitcast(NewVT, V2);
return DAG.getBitcast(
VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
}
}
// Commute the shuffle if it will improve canonicalization.
SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
if (canonicalizeShuffleMaskWithCommute(Mask)) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
}
if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
return V;
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is256BitVector())
return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is512BitVector())
return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (Is1BitVector)
return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
llvm_unreachable("Unimplemented!");
}
/// Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
// Only non-legal VSELECTs reach this lowering; convert those into generic
// shuffles and reuse the shuffle lowering path for blends.
SmallVector<int, 32> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
return SDValue();
}
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
// A vselect where all conditions and data are constants can be optimized into
// a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
return SDValue();
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
return BlendOp;
// If this VSELECT has a vector of i1 as a mask, it will be directly matched
// with patterns on the mask registers on AVX-512.
MVT CondVT = Cond.getSimpleValueType();
unsigned CondEltSize = Cond.getScalarValueSizeInBits();
if (CondEltSize == 1)
return Op;
// Variable blends are only legal from SSE4.1 onward.
if (!Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
// into an i1 condition so that we can use the mask-based 512-bit blend
// instructions.
if (VT.getSizeInBits() == 512) {
// Build a mask by testing the condition against zero.
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
DAG.getConstant(0, dl, CondVT),
ISD::SETNE);
// Now return a new VSELECT using the mask.
return DAG.getSelect(dl, VT, Mask, LHS, RHS);
}
// SEXT/TRUNC cases where the mask doesn't match the destination size.
if (CondEltSize != EltSize) {
// If we don't have a sign splat, rely on the expansion.
if (CondEltSize != DAG.ComputeNumSignBits(Cond))
return SDValue();
MVT NewCondSVT = MVT::getIntegerVT(EltSize);
MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
}
// Only some types will be legal on some subtargets. If we can emit a legal
// VSELECT-matching blend, return Op, but if we need to expand, return
// a null value.
switch (VT.SimpleTy) {
default:
// Most of the vector types have blends past SSE4.1.
return Op;
case MVT::v32i8:
// The byte blends for AVX vectors were introduced only in AVX2.
if (Subtarget.hasAVX2())
return Op;
return SDValue();
case MVT::v8i16:
case MVT::v16i16: {
// Bitcast everything to the vXi8 type and use a vXi8 vselect.
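// This relies on x86 using ZeroOrNegativeOneBooleanContent for vector
// conditions: each i16 condition element is 0 or -1, so both of its bytes
// select the same source.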
MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
Cond = DAG.getBitcast(CastVT, Cond);
LHS = DAG.getBitcast(CastVT, LHS);
RHS = DAG.getBitcast(CastVT, RHS);
SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
return DAG.getBitcast(VT, Select);
}
}
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
if (VT == MVT::f32) {
// EXTRACTPS outputs to a GPR32 register, which will require a movd to copy
// the result back to an FR32 register. It's only worth matching if the
// result has a single use which is a store or a bitcast to i32. And in
// the case of a store, it's not worth it if the index is a constant 0,
// because a MOVSSmr can be used instead, which is smaller and faster.
if (!Op.hasOneUse())
return SDValue();
SDNode *User = *Op.getNode()->use_begin();
if ((User->getOpcode() != ISD::STORE ||
isNullConstant(Op.getOperand(1))) &&
(User->getOpcode() != ISD::BITCAST ||
User->getValueType(0) != MVT::i32))
return SDValue();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
Op.getOperand(1));
return DAG.getBitcast(MVT::f32, Extract);
}
if (VT == MVT::i32 || VT == MVT::i64) {
// ExtractPS/pextrq work with a constant index.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return Op;
}
return SDValue();
}
/// Extract one bit from mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Vec = Op.getOperand(0);
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
MVT EltVT = Op.getSimpleValueType();
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
// A variable index can't be handled in mask registers;
// extend the vector to VR512/VR128.
if (!isa<ConstantSDNode>(Idx)) {
unsigned NumElts = VecVT.getVectorNumElements();
// Extending v8i1/v16i1 to 512 bits gets better performance on KNL
// than extending to 128/256 bits.
MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
// Extend to natively supported kshift.
unsigned NumElems = VecVT.getVectorNumElements();
MVT WideVecVT = VecVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
DAG.getUNDEF(WideVecVT), Vec,
DAG.getIntPtrConstant(0, dl));
}
// Use kshiftr instruction to move to the lower element.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
}
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG, Subtarget);
if (!isa<ConstantSDNode>(Idx)) {
// It's more profitable to go through memory (1 cycle throughput)
// than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
// The IACA tool was used to get the performance estimates
// (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
//
// example : extractelement <16 x i8> %a, i32 %i
//
// Block Throughput: 3.00 Cycles
// Throughput Bottleneck: Port5
//
// | Num Of | Ports pressure in cycles | |
// | Uops | 0 - DV | 5 | 6 | 7 | |
// ---------------------------------------------
// | 1 | | 1.0 | | | CP | vmovd xmm1, edi
// | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
// | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
// Total Num Of Uops: 4
//
//
// Block Throughput: 1.00 Cycles
// Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
//
// | | Ports pressure in cycles | |
// |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
// ---------------------------------------------------------
// |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
// |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
// |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
// Total Num Of Uops: 4
return SDValue();
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
// Get the 128-bit vector.
Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
// this can be done with a mask.
IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(IdxVal, dl));
}
assert(VecVT.is128BitVector() && "Unexpected vector length");
MVT VT = Op.getSimpleValueType();
if (VT.getSizeInBits() == 16) {
// If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
// we're going to zero extend the register or fold the store (SSE41 only).
if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
!(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
// Transform it so it matches pextrw, which produces a 32-bit result.
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
if (Subtarget.hasSSE41())
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
// TODO: We only extract a single element from v16i8, we can probably afford
// to be more aggressive here before using the default approach of spilling to
// stack.
if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
// Extract either the lowest i32 or any i16, and extract the sub-byte.
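// E.g. for IdxVal == 5 this extracts i16 element 2, shifts right by 8 and
// truncates to i8; for IdxVal == 2 it extracts the low i32 and shifts right
// by 16.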
int DWordIdx = IdxVal / 4;
if (DWordIdx == 0) {
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec),
DAG.getIntPtrConstant(DWordIdx, dl));
int ShiftVal = (IdxVal % 4) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
int WordIdx = IdxVal / 2;
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
DAG.getBitcast(MVT::v8i16, Vec),
DAG.getIntPtrConstant(WordIdx, dl));
int ShiftVal = (IdxVal % 2) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
return Op;
// SHUFPS the element to the lowest double word, then movss.
int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
if (VT.getSizeInBits() == 64) {
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
if (IdxVal == 0)
return Op;
// UNPCKHPD the element to the lowest double word, then movsd.
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
return SDValue();
}
/// Insert one bit to mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue Elt = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
MVT VecVT = Vec.getSimpleValueType();
if (!isa<ConstantSDNode>(Idx)) {
// Non-constant index. Extend the source and destination,
// insert the element and then truncate the result.
unsigned NumElts = VecVT.getVectorNumElements();
MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
}
// Copy into a k-register, extract to v1i1 and insert_subvector.
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
Op.getOperand(2));
}
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
if (EltVT == MVT::i1)
return InsertBitToMaskVector(Op, DAG, Subtarget);
SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
auto *N2C = dyn_cast<ConstantSDNode>(N2);
if (!N2C || N2C->getAPIntValue().uge(NumElts))
return SDValue();
uint64_t IdxVal = N2C->getZExtValue();
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
// If we are inserting an element, see if we can do this more efficiently
// with a blend shuffle against a rematerializable vector rather than a
// costly integer insertion.
if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
16 <= EltVT.getSizeInBits()) {
SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
BlendMask.push_back(i == IdxVal ? i + NumElts : i);
SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
: getOnesVector(VT, DAG, dl);
return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
}
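// For example (a sketch): inserting zero into element 2 of a v4i32 builds
// BlendMask = {0, 1, 6, 3}, taking lane 2 from the all-zeros vector and the
// remaining lanes from N0.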
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
// With a 256-bit vector, we can insert into the zero element efficiently
// using a blend if we have AVX or AVX2 and the right data type.
if (VT.is256BitVector() && IdxVal == 0) {
// TODO: It is worthwhile to cast integer to floating point and back
// and incur a domain crossing penalty if that's what we'll end up
// doing anyway after extracting to a 128-bit vector.
if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
(Subtarget.hasAVX2() && EltVT == MVT::i32)) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
DAG.getTargetConstant(1, dl, MVT::i8));
}
}
// Get the desired 128-bit vector chunk.
SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(NumEltsIn128));
// Since NumEltsIn128 is a power of 2, we can use a mask instead of a modulo.
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
DAG.getIntPtrConstant(IdxIn128, dl));
// Insert the changed part back into the bigger vector.
return insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
// This will be just movd/movq/movss/movsd.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
(EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
EltVT == MVT::i64)) {
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
}
// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
// argument. SSE4.1 is required for pinsrb.
if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
unsigned Opc;
if (VT == MVT::v8i16) {
assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
Opc = X86ISD::PINSRW;
} else {
assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
Opc = X86ISD::PINSRB;
}
if (N1.getValueType() != MVT::i32)
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
}
if (Subtarget.hasSSE41()) {
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
// these bits. For example (insert (extract, 3), 2) could be matched by
// putting the '3' into bits [7:6] of X86ISD::INSERTPS.
// Bits [5:4] of the constant are the destination select. This is the
// value of the incoming immediate.
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
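// For example (an illustrative encoding only): inserting into element 2
// with no zeroing gives IdxVal << 4 == 0x20, i.e. destination select 2 in
// bits [5:4] and a zero mask of 0 in bits [3:0].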
bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
// than an insertps. Blends are simpler operations in hardware and so
// will always have equal or better performance than insertps.
// But if optimizing for size and there's a load folding opportunity,
// generate insertps because blendps does not have a 32-bit memory
// operand form.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
DAG.getTargetConstant(1, dl, MVT::i8));
}
// Create this as a scalar-to-vector.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
}
// PINSR* works with constant index.
if (EltVT == MVT::i32 || EltVT == MVT::i64)
return Op;
}
return SDValue();
}
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT OpVT = Op.getSimpleValueType();
// It's always cheaper to replace an xor+movd pair with xorps, and doing so
// simplifies further combines.
if (X86::isZeroNode(Op.getOperand(0)))
return getZeroVector(OpVT, Subtarget, DAG, dl);
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
unsigned SizeFactor = OpVT.getSizeInBits() / 128;
MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
"Expected an SSE type!");
// Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
if (OpVT == MVT::v4i32)
return Op;
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
return DAG.getBitcast(
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
return insert1BitVector(Op, DAG, Subtarget);
}
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Only vXi1 extract_subvectors need custom lowering");
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
if (!isa<ConstantSDNode>(Idx))
return SDValue();
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
MVT VecVT = Vec.getSimpleValueType();
unsigned NumElems = VecVT.getVectorNumElements();
// Extend to a natively supported kshift.
MVT WideVecVT = VecVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
DAG.getUNDEF(WideVecVT), Vec,
DAG.getIntPtrConstant(0, dl));
}
// Shift to the LSB.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
}
// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(
const GlobalValue *GV, const unsigned char OpFlags) const {
// References to absolute symbols are never PC-relative.
if (GV && GV->isAbsoluteSymbolRef())
return X86ISD::Wrapper;
CodeModel::Model M = getTargetMachine().getCodeModel();
if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
return X86ISD::WrapperRIP;
// GOTPCREL references must always use RIP.
if (OpFlags == X86II::MO_GOTPCREL)
return X86ISD::WrapperRIP;
return X86ISD::Wrapper;
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered to
// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes are selected
// into MOV32ri.
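// As a sketch (for orientation, not emitted verbatim), the lowered DAG for a
// constant-pool reference looks like
//   (X86ISD::Wrapper (TargetConstantPool <cst>))
// optionally wrapped in an (ISD::ADD GlobalBaseReg, ...) in PIC mode.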
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
return Result;
}
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
SDLoc DL(JT);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
return Result;
}
SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// Create the TargetBlockAddressAddress node.
unsigned char OpFlags =
Subtarget.classifyBlockAddressReference();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
return Result;
}
/// Creates target global address or external symbol nodes for calls or
/// other uses.
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
bool ForCall) const {
// Unpack the global address or external symbol.
const SDLoc &dl = SDLoc(Op);
const GlobalValue *GV = nullptr;
int64_t Offset = 0;
const char *ExternalSym = nullptr;
if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
GV = G->getGlobal();
Offset = G->getOffset();
} else {
const auto *ES = cast<ExternalSymbolSDNode>(Op);
ExternalSym = ES->getSymbol();
}
// Calculate some flags for address lowering.
const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
unsigned char OpFlags;
if (ForCall)
OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
else
OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
bool NeedsLoad = isGlobalStubReference(OpFlags);
CodeModel::Model M = DAG.getTarget().getCodeModel();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (GV) {
// Create a target global address if this is a global. If possible, fold the
// offset into the global address reference. Otherwise, ADD it on later.
int64_t GlobalOffset = 0;
if (OpFlags == X86II::MO_NO_FLAG &&
X86::isOffsetSuitableForCodeModel(Offset, M)) {
std::swap(GlobalOffset, Offset);
}
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
} else {
// If this is not a global address, this must be an external symbol.
Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
}
// If this is a direct call, avoid the wrapper if we don't need to do any
// loads or adds. This allows SDAG ISel to match direct calls.
if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
return Result;
Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (HasPICReg) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
// For globals that require a load from a stub to get the address, emit the
// load.
if (NeedsLoad)
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
if (Offset != 0)
Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
DAG.getConstant(Offset, dl, PtrVT));
return Result;
}
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
unsigned char OperandFlags, bool LocalDynamic = false) {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDLoc dl(GA);
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(),
OperandFlags);
X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
: X86ISD::TLSADDR;
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
} else {
SDValue Ops[] = { Chain, TGA };
Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
}
// TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
MFI.setAdjustsStack(true);
MFI.setHasCalls(true);
SDValue Flag = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
SDValue InFlag;
SDLoc dl(GA); // ? function entry point might be better
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg,
SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
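// The sequence built here roughly corresponds to the standard IA-32 ELF
// general-dynamic form (shown for orientation only):
//   leal x@tlsgd(,%ebx,1), %eax
//   call ___tls_get_addr@PLT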
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
X86::RAX, X86II::MO_TLSGD);
}
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG,
const EVT PtrVT,
bool is64Bit) {
SDLoc dl(GA);
// Get the start address of the TLS block for this module.
X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
.getInfo<X86MachineFunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
SDValue Base;
if (is64Bit) {
Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
X86II::MO_TLSLD, /*LocalDynamic=*/true);
} else {
SDValue InFlag;
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
X86II::MO_TLSLDM, /*LocalDynamic=*/true);
}
// Note: the CleanupLocalDynamicTLSPass will remove redundant computations
// of Base.
// Build x@dtpoff.
unsigned char OperandFlags = X86II::MO_DTPOFF;
unsigned WrapperKind = X86ISD::Wrapper;
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
// Add x@dtpoff with the base.
return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}
// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT, TLSModel::Model model,
bool is64Bit, bool isPIC) {
SDLoc dl(GA);
// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
is64Bit ? 257 : 256));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
MachinePointerInfo(Ptr));
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP-relative, even on x86-64. One exception is
// the initial-exec model.
unsigned WrapperKind = X86ISD::Wrapper;
if (model == TLSModel::LocalExec) {
OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
} else if (model == TLSModel::InitialExec) {
if (is64Bit) {
OperandFlags = X86II::MO_GOTTPOFF;
WrapperKind = X86ISD::WrapperRIP;
} else {
OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
}
} else {
llvm_unreachable("Unexpected model");
}
// emit "addl x@ntpoff,%eax" (local exec)
// or "addl x@indntpoff,%eax" (initial exec)
// or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
SDValue TGA =
DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
if (model == TLSModel::InitialExec) {
if (isPIC && !is64Bit) {
Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
}
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
const GlobalValue *GV = GA->getGlobal();
auto PtrVT = getPointerTy(DAG.getDataLayout());
bool PositionIndependent = isPositionIndependent();
if (Subtarget.isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
if (Subtarget.is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
Subtarget.is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
PositionIndependent);
}
llvm_unreachable("Unknown TLS model.");
}
if (Subtarget.isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
X86ISD::WrapperRIP : X86ISD::Wrapper;
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
if (PIC32)
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
OpFlag = X86II::MO_TLVP;
SDLoc DL(Op);
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getValueType(0),
GA->getOffset(), OpFlag);
SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC32, the address is actually $g + Offset.
if (PIC32)
Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
// Lowering the machine isd will make sure everything is in the right
// location.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
DAG.getIntPtrConstant(0, DL, true),
Chain.getValue(1), DL);
// TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setAdjustsStack(true);
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
if (Subtarget.isOSWindows()) {
// Just use the implicit TLS architecture.
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// ; from TEB
// mov ecx, dword [rel _tls_index]; Load index (from C runtime)
// mov rcx, qword [rdx+rcx*8]
// mov eax, .tls$:tlsvar
// [rax+rcx] contains the address
// Windows 64bit: gs:0x58
// Windows 32bit: fs:__tls_array
SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
// use its literal value of 0x2C.
Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
? Type::getInt8PtrTy(*DAG.getContext(),
256)
: Type::getInt32PtrTy(*DAG.getContext(),
257));
SDValue TlsArray = Subtarget.is64Bit()
? DAG.getIntPtrConstant(0x58, dl)
: (Subtarget.isTargetWindowsGNU()
? DAG.getIntPtrConstant(0x2C, dl)
: DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
SDValue res;
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
res = ThreadPointer;
} else {
// Load the _tls_index variable
SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
if (Subtarget.is64Bit())
IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
MachinePointerInfo(), MVT::i32);
else
IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
auto &DL = DAG.getDataLayout();
SDValue Scale =
DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
}
res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
// Get the offset of start of .tls section
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), X86II::MO_SECREL);
SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
}
llvm_unreachable("TLS not implemented for this target.");
}
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
MVT VT = Op.getSimpleValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
// ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
// ISD::SRA/SRL nodes do not. Insert an AND to be safe; it's optimized away
// during isel.
SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits - 1, dl, MVT::i8));
SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, dl, MVT::i8))
: DAG.getConstant(0, dl, VT);
SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
} else {
Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
}
// If the shift amount is greater than or equal to the width of a part, we
// can't rely on the results of shld/shrd. Insert a test and select the
// appropriate values for large shift amounts.
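// For example (a sketch with VTBits == 32): an SHL_PARTS amount of 40 has
// bit 5 set, so Cond is true and we select Hi = Tmp3 = ShOpLo << (40 & 31)
// = ShOpLo << 8 and Lo = Tmp1 = 0, matching a 64-bit left shift by 40.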
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i8));
SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
SDValue Hi, Lo;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
} else {
Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
}
return DAG.getMergeValues({ Lo, Hi }, dl);
}
static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
"Unexpected funnel shift opcode!");
SDLoc DL(Op);
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
bool IsFSHR = Op.getOpcode() == ISD::FSHR;
if (VT.isVector()) {
assert(Subtarget.hasVBMI2() && "Expected VBMI2");
if (IsFSHR)
std::swap(Op0, Op1);
APInt APIntShiftAmt;
if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0,
Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
}
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
Op0, Op1, Amt);
}
assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
"Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
if (IsFSHR)
std::swap(Op0, Op1);
// For i16 we must modulo the shift amount explicitly; i32/i64 shifts have an
// implicit modulo.
if (VT == MVT::i16)
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
DAG.getConstant(15, DL, Amt.getValueType()));
unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
}
// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Op.getOpcode() == ISD::SINT_TO_FP ||
Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
Op.getOpcode() == ISD::UINT_TO_FP) &&
"Unexpected opcode!");
bool IsStrict = Op->isStrictFPOpcode();
unsigned OpNo = IsStrict ? 1 : 0;
SDValue Src = Op.getOperand(OpNo);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
(VT != MVT::f32 && VT != MVT::f64))
return SDValue();
// Pack the i64 into a vector, do the operation, and extract.
// Use a 256-bit wide vector so the result is a full 128 bits for the f32 case.
unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
MVT VecVT = MVT::getVectorVT(VT, NumElts);
SDLoc dl(Op);
SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
if (IsStrict) {
SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
{Op.getOperand(0), InVec});
SDValue Chain = CvtVec.getValue(1);
SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
DAG.getIntPtrConstant(0, dl));
return DAG.getMergeValues({Value, Chain}, dl);
}
SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
DAG.getIntPtrConstant(0, dl));
}
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
const X86Subtarget &Subtarget) {
switch (Opcode) {
case ISD::SINT_TO_FP:
// TODO: Handle wider types with AVX/AVX512.
if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
return false;
// CVTDQ2PS or (V)CVTDQ2PD
return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
case ISD::UINT_TO_FP:
// TODO: Handle wider types and i64 elements.
if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
return false;
// VCVTUDQ2PS or VCVTUDQ2PD
return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
default:
return false;
}
}
/// Given a scalar cast operation that is extracted from a vector, try to
/// vectorize the cast op followed by extraction. This will avoid an expensive
/// round-trip between XMM and GPR.
static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// TODO: This could be enhanced to handle smaller integer types by peeking
// through an extend.
SDValue Extract = Cast.getOperand(0);
MVT DestVT = Cast.getSimpleValueType();
if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Extract.getOperand(1)))
return SDValue();
// See if we have a 128-bit vector cast op for this type of cast.
SDValue VecOp = Extract.getOperand(0);
MVT FromVT = VecOp.getSimpleValueType();
unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
return SDValue();
// If we are extracting from a non-zero element, first shuffle the source
// vector to allow extracting from element zero.
SDLoc DL(Cast);
if (!isNullConstant(Extract.getOperand(1))) {
SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
Mask[0] = Extract.getConstantOperandVal(1);
VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
}
// If the source vector is wider than 128-bits, extract the low part. Do not
// create an unnecessarily wide vector cast op.
if (FromVT != Vec128VT)
VecOp = extract128BitVector(VecOp, 0, DAG, DL);
// cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
// cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
DAG.getIntPtrConstant(0, DL));
}
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
bool IsStrict = Op->isStrictFPOpcode();
MVT VT = Op->getSimpleValueType(0);
SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
if (Subtarget.hasDQI()) {
assert(!Subtarget.hasVLX() && "Unexpected features");
assert((Src.getSimpleValueType() == MVT::v2i64 ||
Src.getSimpleValueType() == MVT::v4i64) &&
"Unsupported custom type");
// With AVX512DQ but not VLX, we need to widen to get a 512-bit result type.
assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
"Unexpected VT!");
MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
: DAG.getUNDEF(MVT::v8i64);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
DAG.getIntPtrConstant(0, DL));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
{Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
}
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, DL);
return Res;
}
bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
if (VT != MVT::v4f32 || IsSigned)
return SDValue();
SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
SmallVector<SDValue, 4> SignCvts(4);
SmallVector<SDValue, 4> Chains(4);
for (int i = 0; i != 4; ++i) {
SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
DAG.getIntPtrConstant(i, DL));
if (IsStrict) {
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
{Op.getOperand(0), Src});
Chains[i] = SignCvts[i].getValue(1);
} else {
SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src);
}
}
SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
SDValue Slow, Chain;
if (IsStrict) {
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
{Chain, SignCvt, SignCvt});
Chain = Slow.getValue(1);
} else {
Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
}
IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
if (IsStrict)
return DAG.getMergeValues({Cvt, Chain}, DL);
return Cvt;
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
unsigned OpNo = IsStrict ? 1 : 0;
SDValue Src = Op.getOperand(OpNo);
SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
// Note: since v2f64 is a legal type, we don't need to zero-extend the
// source for strict FP.
if (IsStrict)
return DAG.getNode(
X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
{Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT))});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT)));
}
if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
return SDValue();
}
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
// These are really Legal; return the operand so the caller accepts it as
// Legal.
if (SrcVT == MVT::i32 && UseSSEReg)
return Op;
if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
return Op;
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
// SSE doesn't have an i16 conversion so we need to promote.
if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{Chain, Ext});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
}
if (VT == MVT::f128)
return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
SDValue ValueToStore = Src;
if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Chain = DAG.getStore(
Chain, dl, ValueToStore, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
std::pair<SDValue, SDValue> Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
return Tmp.first;
}
std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) const {
// Build the FILD
SDLoc DL(Op);
SDVTList Tys;
bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
if (useSSE)
Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
else
Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
unsigned ByteSize = SrcVT.getSizeInBits() / 8;
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
MachineMemOperand *LoadMMO;
if (FI) {
int SSFI = FI->getIndex();
LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOLoad, ByteSize, ByteSize);
} else {
LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
StackSlot = StackSlot.getOperand(1);
}
SDValue FILDOps[] = {Chain, StackSlot};
SDValue Result =
DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
Tys, FILDOps, SrcVT, LoadMMO);
Chain = Result.getValue(1);
if (useSSE) {
SDValue InFlag = Result.getValue(2);
// FIXME: Currently the FST is glued to the FILD_FLAG. This
// shouldn't be necessary except that RFP cannot be live across
// multiple blocks. When the stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
unsigned SSFISize = Op.getValueSizeInBits() / 8;
int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOStore, SSFISize, SSFISize);
Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
Op.getValueType(), StoreMMO);
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
Chain = Result.getValue(1);
}
return { Result, Chain };
}
/// Horizontal vector math instructions may be slower than normal math with
/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsOptimizingSize = DAG.shouldOptForSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// This algorithm is not obvious. Here is what we're trying to output:
/*
movq %rax, %xmm0
punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
#ifdef __SSE3__
haddpd %xmm0, %xmm0
#else
pshufd $0x4e, %xmm0, %xmm1
addpd %xmm1, %xmm0
#endif
*/
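// A sketch of why the constants work: write the u64 input as hi * 2^32 + lo.
// punpckldq pairs lo with 0x43300000 and hi with 0x45300000, forming the
// doubles (2^52 + lo) and (2^84 + hi * 2^32), because 0x433 and 0x453 are
// the exponent fields of 2^52 and 2^84 and the integers land in the low
// mantissa bits. Subtracting c1 = { 2^52, 2^84 } leaves { lo, hi * 2^32 }
// as exact doubles, and the horizontal add reassembles the full value.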
bool IsStrict = Op->isStrictFPOpcode();
unsigned OpNo = IsStrict ? 1 : 0;
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
SmallVector<Constant*,2> CV1;
CV1.push_back(
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4330000000000000ULL))));
CV1.push_back(
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
// Load the 64-bit value into an XMM register.
SDValue XR1 =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
SDValue CLod0 =
DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue Unpck1 =
getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
SDValue CLod1 =
DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
SDValue Sub;
SDValue Chain;
// TODO: Are there any fast-math-flags to propagate here?
if (IsStrict) {
Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
{Op.getOperand(0), XR2F, CLod1});
Chain = Sub.getValue(1);
} else
Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
if (!IsStrict && Subtarget.hasSSE3() &&
shouldUseHorizontalOp(true, DAG, Subtarget)) {
// FIXME: Do we need a STRICT version of FHADD?
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
if (IsStrict) {
Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
{Chain, Shuffle, Sub});
Chain = Result.getValue(1);
} else
Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Result, Chain}, dl);
return Result;
}
/// 32-bit unsigned integer to float expansion.
static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
SDLoc dl(Op);
// FP constant used to bias-correct the final result.
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
MVT::f64);
// Load the 32-bit value into an XMM register.
SDValue Load =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
// Zero out the upper parts of the register.
Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
// Or the load with the bias.
SDValue Or = DAG.getNode(
ISD::OR, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, Load),
DAG.getBitcast(MVT::v2i64,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
Or =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
if (Op.getNode()->isStrictFPOpcode()) {
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Chain = Op.getOperand(0);
SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
{Chain, Or, Bias});
if (Op.getValueType() == Sub.getValueType())
return Sub;
// Handle final rounding.
std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
}
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
// Handle final rounding.
return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
}
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (Op.getSimpleValueType() != MVT::v2f64)
return SDValue();
bool IsStrict = Op->isStrictFPOpcode();
SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
if (Subtarget.hasAVX512()) {
if (!Subtarget.hasVLX()) {
// Let generic type legalization widen this.
if (!IsStrict)
return SDValue();
// Otherwise pad the integer input with 0s and widen the operation.
N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getConstant(0, DL, MVT::v2i32));
SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
{Op.getOperand(0), N0});
SDValue Chain = Res.getValue(1);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
DAG.getIntPtrConstant(0, DL));
return DAG.getMergeValues({Res, Chain}, DL);
}
// Legalize to v4i32 type.
N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getUNDEF(MVT::v2i32));
if (IsStrict)
return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
{Op.getOperand(0), N0});
return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
}
// Zero-extend to v2i64 and OR with the floating-point representation of 2^52.
// This gives us the floating-point equivalent of 2^52 + the i32 integer,
// since double has 52 bits of mantissa. Then subtract 2^52 in floating
// point, leaving just our i32 integers in double format.
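// For example (a sketch): an input element of 7 produces the bit pattern
// 0x4330000000000007, which as a double is 2^52 + 7; subtracting VBias
// (2^52) yields exactly 7.0.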
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
if (IsStrict)
return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
{Op.getOperand(0), Or, VBias});
return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
}
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
bool IsStrict = Op->isStrictFPOpcode();
SDValue V = Op->getOperand(IsStrict ? 1 : 0);
MVT VecIntVT = V.getSimpleValueType();
assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
"Unsupported custom type");
if (Subtarget.hasAVX512()) {
// With AVX512 but not VLX, we need to widen to get a 512-bit result type.
assert(!Subtarget.hasVLX() && "Unexpected features");
MVT VT = Op->getSimpleValueType(0);
// v8i32->v8f64 is legal with AVX512 so just return it.
if (VT == MVT::v8f64)
return Op;
assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
"Unexpected VT!");
MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
SDValue Tmp =
IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
DAG.getIntPtrConstant(0, DL));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
{Op->getOperand(0), V});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
}
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, DL);
return Res;
}
if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
Op->getSimpleValueType(0) == MVT::v4f64) {
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
Constant *Bias = ConstantFP::get(
*DAG.getContext(),
APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8);
SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
SDValue VBias = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/*Alignment*/ 8, MachineMemOperand::MOLoad);
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
DAG.getBitcast(MVT::v4i64, VBias));
Or = DAG.getBitcast(MVT::v4f64, Or);
if (IsStrict)
return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
{Op.getOperand(0), Or, VBias});
return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
}
// The algorithm is the following:
// #ifdef __SSE4_1__
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
// #else
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
// uint4 hi = (v >> 16) | (uint4) 0x53000000;
// #endif
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// return (float4) lo + fhi;
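// A sketch of why this works: lo becomes the float 2^23 + (v & 0xffff) and
// hi becomes 2^39 + (v >> 16) * 2^16, since 0x4b000000 and 0x53000000 are
// 2^23 and 2^39 as IEEE f32 bit patterns. Subtracting (0x1.0p39f + 0x1.0p23f)
// from hi and then adding lo cancels both biases, leaving exactly
// (v >> 16) * 2^16 + (v & 0xffff) == v.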
bool Is128 = VecIntVT == MVT::v4i32;
MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
// If we are converting to something other than the supported type, e.g., to
// v4f64, abort early.
if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
// In the #ifdef/#else code, we have in common:
// - The vector of constants:
// -- 0x4b000000
// -- 0x53000000
// - A shift:
// -- v >> 16
// Create the splat vector for 0x4b000000.
SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
// Create the splat vector for 0x53000000.
SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
// Create the right shift.
SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
SDValue Low, High;
if (Subtarget.hasSSE41()) {
MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
// Low will be bitcasted right away, so do not bother bitcasting back to its
// original type.
Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
// High will be bitcasted right away, so do not bother bitcasting back to
// its original type.
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
} else {
SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
// uint4 hi = (v >> 16) | (uint4) 0x53000000;
High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
}
// Create the vector constant for (0x1.0p39f + 0x1.0p23f).
SDValue VecCstFSub = DAG.getConstantFP(
APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// NOTE: By using fsub of a positive constant instead of fadd of a negative
// constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
// enabled. See PR24512.
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
// TODO: Are there any fast-math-flags to propagate here?
// (float4) lo;
SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
// return (float4) lo + fhi;
if (IsStrict) {
SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
{Op.getOperand(0), HighBitcast, VecCstFSub});
return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
{FHigh.getValue(1), LowBitcast, FHigh});
}
SDValue FHigh =
DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
SDValue N0 = Op.getOperand(OpNo);
MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
switch (SrcVT.SimpleTy) {
default:
llvm_unreachable("Custom UINT_TO_FP is not supported!");
case MVT::v2i32:
return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
case MVT::v4i32:
case MVT::v8i32:
return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
case MVT::v2i64:
case MVT::v4i64:
return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
}
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
unsigned OpNo = IsStrict ? 1 : 0;
SDValue Src = Op.getOperand(OpNo);
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
MVT SrcVT = Src.getSimpleValueType();
MVT DstVT = Op->getSimpleValueType(0);
SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
if (DstVT == MVT::f128)
return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
(SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
// Conversions from unsigned i32 to f32/f64 are legal,
// using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
return Op;
}
// Promote i32 to i64 and use a signed conversion on 64-bit targets.
if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
{Chain, Src});
return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
}
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store1 =
DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo());
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
OffsetSlot, MachinePointerInfo());
std::pair<SDValue, SDValue> Tmp =
BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
return Tmp.first;
}
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
SDValue ValueToStore = Src;
if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
}
SDValue Store =
DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo());
// For an i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
// we must be careful to do the computation in x87 extended precision, not
// in SSE. (The generic code can't know it's OK to do this, or how to.)
int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOLoad, 8, 8);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
MVT::i64, MMO);
Chain = Fild.getValue(1);
// Check whether the sign bit is set.
SDValue SignSet = DAG.getSetCC(
dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
// Build a 64-bit pair (FF, 0) in the constant pool, with FF in the hi bits.
APInt FF(64, 0x5F80000000000000ULL);
SDValue FudgePtr = DAG.getConstantPool(
ConstantInt::get(*DAG.getContext(), FF), PtrVT);
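// Note (for orientation): 0x5F800000 is 2^64 as an IEEE f32, so the pool
// entry holds 0.0f at offset 0 and 2^64 at offset 4; adding the selected
// value compensates for FILD having treated inputs >= 2^63 as negative.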
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
/* Alignment = */ 4);
Chain = Fudge.getValue(1);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
if (IsStrict) {
SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
{Chain, Fild, Fudge});
// STRICT_FP_ROUND can't handle equal types.
if (DstVT == MVT::f80)
return Add;
return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
{Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
}
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl));
}
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence and return the
// result.
SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool IsSigned, SDValue &Chain) const {
bool IsStrict = Op->isStrictFPOpcode();
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
EVT TheVT = Value.getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
// f16 must be promoted before using the lowering in this routine.
// fp128 does not use this lowering.
return SDValue();
}
// If using FIST to compute an unsigned i64, we'll need some fixup
// to handle values above the maximum signed i64. A FIST is always
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
// FIXME: This does not generate an invalid exception if the input does not
// fit in i32. PR44019
if (!IsSigned && DstTy != MVT::i64) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
DstTy = MVT::i64;
}
assert(DstTy.getSimpleVT() <= MVT::i64 &&
DstTy.getSimpleVT() >= MVT::i16 &&
"Unknown FP_TO_INT to lower!");
// We lower FP->int64 into FISTP64 followed by a load from a temporary
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getStoreSize();
int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
if (UnsignedFixup) {
//
// Conversion to unsigned i64 is implemented with a select,
// depending on whether the source value fits in the range
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
// Adjust = (Value < Thresh) ? 0 : 0x80000000; // XOR'd into the high 32 bits
// FltOfs = (Value < Thresh) ? 0 : Thresh;
// FistSrc = (Value - FltOfs);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
// to XOR'ing the high 32 bits with Adjust.
//
// Being a power of 2, Thresh is exactly representable in all FP formats.
// For X87 we'd like to use the smallest FP type for this constant, but
// for DAG type consistency we have to match the FP operand type.
APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
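// (0x5f000000 is the IEEE-754 single-precision bit pattern for 2^63:
// exponent 0xBE = 190, 190 - 127 = 63, zero mantissa.)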
LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
&LosesInfo);
else if (TheVT == MVT::f80)
Status = Thresh.convert(APFloat::x87DoubleExtended(),
APFloat::rmNearestTiesToEven, &LosesInfo);
assert(Status == APFloat::opOK && !LosesInfo &&
"FP conversion should have been exact");
SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT);
SDValue Cmp;
if (IsStrict) {
Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
Chain, /*IsSignaling*/ true);
Chain = Cmp.getValue(1);
} else {
Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
}
Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
DAG.getConstant(0, DL, MVT::i64),
DAG.getConstant(APInt::getSignMask(64),
DL, MVT::i64));
SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
DAG.getConstantFP(0.0, DL, TheVT),
ThreshVal);
if (IsStrict) {
Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
{ Chain, Value, FltOfs });
Chain = Value.getValue(1);
} else
Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
}
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
// FIXME: This causes a redundant load/store if the SSE-class value is
// already in memory, such as when it is on the call stack.
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
SDValue Ops[] = { Chain, StackSlot };
unsigned FLDSize = TheVT.getStoreSize();
assert(FLDSize <= MemSize && "Stack slot not big enough");
MachineMemOperand *MMO = MF.getMachineMemOperand(
MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
Chain = Value.getValue(1);
}
// Build the FP_TO_INT*_IN_MEM
MachineMemOperand *MMO = MF.getMachineMemOperand(
MPI, MachineMemOperand::MOStore, MemSize, MemSize);
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
DAG.getVTList(MVT::Other),
Ops, DstTy, MMO);
SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
Chain = Res.getValue(1);
// If we need an unsigned fixup, XOR the result with adjust.
if (UnsignedFixup)
Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
return Res;
}
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
"Unexpected extension opcode");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::i64) &&
"Unexpected element type");
assert((InVT.getVectorElementType() == MVT::i8 ||
InVT.getVectorElementType() == MVT::i16 ||
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
if (VT != MVT::v8i64)
return SDValue();
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
return DAG.getNode(ExtendInVecOpc, dl, VT, In);
}
if (Subtarget.hasInt256())
return Op;
// Optimize vectors in AVX mode:
//
// v8i16 -> v8i32
// Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
// Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
// Concat upper and lower parts.
//
// v4i32 -> v4i64
// Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
// Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
// Concat upper and lower parts.
//
MVT HalfVT = VT.getHalfNumVectorElementsVT();
SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
// Short-circuit if we can determine that each 128-bit half is the same value.
// Otherwise, this is difficult to match and optimize.
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
SDValue Undef = DAG.getUNDEF(InVT);
bool NeedZero = Opc == ISD::ZERO_EXTEND;
SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
OpHi = DAG.getBitcast(HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
const SDLoc &dl, SelectionDAG &DAG) {
assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
DAG.getIntPtrConstant(0, dl));
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
DAG.getIntPtrConstant(8, dl));
Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
// For all vectors except vXi8 we can just emit a sign_extend and a shift.
// This avoids a constant pool load.
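// e.g. for vXi32 elements: an i1 of 1 sign-extends to 0xFFFFFFFF, and a
// logical shift right by 31 leaves 1, while an i1 of 0 stays 0.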
if (VT.getVectorElementType() != MVT::i8) {
SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
return DAG.getNode(ISD::SRL, DL, VT, Extend,
DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
}
// Extend VT if BWI is not supported.
MVT ExtVT = VT;
if (!Subtarget.hasBWI()) {
// If v16i32 is to be avoided, we'll need to split and concatenate.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
}
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
NumElts *= 512 / ExtVT.getSizeInBits();
InVT = MVT::getVectorVT(MVT::i1, NumElts);
In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
In, DAG.getIntPtrConstant(0, DL));
WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
NumElts);
}
SDValue One = DAG.getConstant(1, DL, WideVT);
SDValue Zero = DAG.getConstant(0, DL, WideVT);
SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
// Truncate if we had to extend above.
if (VT != ExtVT) {
WideVT = MVT::getVectorVT(MVT::i8, NumElts);
SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
}
// Extract back to 128/256-bit if we widened.
if (WideVT != VT)
SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
DAG.getIntPtrConstant(0, DL));
return SelectedVal;
}
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
if (SVT.getVectorElementType() == MVT::i1)
return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
return LowerAVXExtend(Op, DAG, Subtarget);
}
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
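/// e.g. a v8i32 whose elements each have at least 17 sign bits can be
/// PACKSSDW'd to v8i16 losslessly, since every value already fits in i16
/// and signed saturation never triggers.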
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
"Unexpected PACK opcode");
assert(DstVT.isVector() && "VT not a vector?");
// Requires SSE2 but AVX512 has fast vector truncate.
if (!Subtarget.hasSSE2())
return SDValue();
EVT SrcVT = In.getValueType();
// No truncation required, we might get here due to recursive calls.
if (SrcVT == DstVT)
return In;
// We only support vector truncation to 64 bits or greater from a
// source of 128 bits or greater.
unsigned DstSizeInBits = DstVT.getSizeInBits();
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
return SDValue();
unsigned NumElems = SrcVT.getVectorNumElements();
if (!isPowerOf2_32(NumElems))
return SDValue();
LLVMContext &Ctx = *DAG.getContext();
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
// Pack to the largest type possible:
// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
EVT InVT = MVT::i16, OutVT = MVT::i8;
if (SrcVT.getScalarSizeInBits() > 16 &&
(Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
InVT = MVT::i32;
OutVT = MVT::i16;
}
// 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
if (SrcVT.is128BitVector()) {
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
In = DAG.getBitcast(InVT, In);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
Res = extractSubVector(Res, 0, DAG, DL, 64);
return DAG.getBitcast(DstVT, Res);
}
// Extract lower/upper subvectors.
unsigned NumSubElts = NumElems / 2;
SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
return DAG.getBitcast(DstVT, Res);
}
// AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
// AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
// Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
SmallVector<int, 64> Mask;
int Scale = 64 / OutVT.getScalarSizeInBits();
scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
return DAG.getBitcast(DstVT, Res);
// If 512bit -> 128bit truncate another stage.
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
Res = DAG.getBitcast(PackedVT, Res);
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
// Recursively pack lower/upper subvectors, concat result and pack again.
assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
// Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
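// e.g. for i16 elements: an LSB of 1 shifted left by 15 gives 0x8000, so
// the (0 > In) compares below read the original bit 0 out of the sign bit.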
unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
if (InVT.getScalarSizeInBits() <= 16) {
if (Subtarget.hasBWI()) {
// legal, will go to VPMOVB2M, VPMOVW2M
if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
// We need to shift to get the lsb into sign position. Shifting packed
// bytes is not supported natively, so bitcast to words first.
MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
In = DAG.getNode(ISD::SHL, DL, ExtVT,
DAG.getBitcast(ExtVT, In),
DAG.getConstant(ShiftInx, DL, ExtVT));
In = DAG.getBitcast(InVT, In);
}
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
In, ISD::SETGT);
}
// Use TESTD/Q: extend the vector to packed dword/qword.
assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
"Unexpected vector type.");
unsigned NumElts = InVT.getVectorNumElements();
assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
// We need to change to a wider element type that we have support for.
// For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
// For 16 element vectors we extend to v16i32 unless we are explicitly
// trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
// we need to split into two 8 element vectors which we can extend to v8i32,
// truncate and concat the results. There's an additional complication if
// the original type is v16i8. In that case we can't split the v16i8 so
// first we pre-extend it to v16i16 which we can split to v8i16, then extend
// to v8i32, truncate that to v8i1 and concat the two halves.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
if (InVT == MVT::v16i8) {
// First we need to sign extend up to 256-bits so we can split that.
InVT = MVT::v16i16;
In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
}
SDValue Lo = extract128BitVector(In, 0, DAG, DL);
SDValue Hi = extract128BitVector(In, 8, DAG, DL);
// We're split now, just emit two truncates and a concat. The two
// truncates will trigger legalization to come back to this function.
Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
// We either have 8 elements or we're allowed to use 512-bit vectors.
// If we have VLX, we want to use the narrowest vector that can get the
// job done so we use vXi32.
MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
ShiftInx = InVT.getScalarSizeInBits() - 1;
}
if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
// We need to shift to get the lsb into sign position.
In = DAG.getNode(ISD::SHL, DL, InVT, In,
DAG.getConstant(ShiftInx, DL, InVT));
}
// If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
if (Subtarget.hasDQI())
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
unsigned InNumEltBits = InVT.getScalarSizeInBits();
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
// If we're called by the type legalizer, handle a few cases.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(InVT)) {
if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
VT.is128BitVector()) {
assert(Subtarget.hasVLX() && "Unexpected subtarget!");
// The default behavior is to truncate one step, concatenate, and then
// truncate the remainder. We'd rather produce two 64-bit results and
// concatenate those.
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
// Otherwise let default legalization handle it.
return SDValue();
}
if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
// word to byte only under BWI. Otherwise we have to promote to v16i32
// and then truncate that. But we should only do that if we haven't been
// asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
// handled by isel patterns.
if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
Subtarget.canExtendTo512DQ())
return Op;
}
unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Truncate with PACKUS if we are truncating a vector with leading zero bits
// that extend all the way to the packed/truncated value.
// Pre-SSE41 we can only use PACKUSWB.
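// e.g. v8i32 -> v8i16 via PACKUSDW is lossless only when the upper 16 bits
// of every i32 element are known zero; otherwise unsigned saturation would
// clamp the value to 0xFFFF.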
KnownBits Known = DAG.computeKnownBits(In);
if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
return V;
// Truncate with PACKSS if we are truncating a vector with sign-bits that
// extend all the way to the packed/truncated value.
if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
return V;
// Handle truncation of V256 to V128 using shuffles.
assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(2, DL));
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
static const int ShufMask[] = {0, 2, 4, 6};
return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
}
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
// The PSHUFB mask:
static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1,
16, 17, 20, 21, 24, 25, 28, 29,
-1, -1, -1, -1, -1, -1, -1, -1 };
In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
static const int ShufMask2[] = {0, 2, -1, -1};
In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
}
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(4, DL));
OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
// The PSHUFB mask:
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
// The MOVLHPS Mask:
static const int ShufMask2[] = {0, 1, 4, 5};
SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
return DAG.getBitcast(MVT::v8i16, res);
}
if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
// Use an AND to zero upper bits for PACKUS.
In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
DAG.getIntPtrConstant(0, DL));
SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
DAG.getIntPtrConstant(8, DL));
return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
}
llvm_unreachable("All 256->128 cases should have been handled above!");
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
MVT VT = Op->getSimpleValueType(0);
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
MVT SrcVT = Src.getSimpleValueType();
SDLoc dl(Op);
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
MVT TruncVT = MVT::v4i1;
unsigned Opc;
if (IsStrict)
Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
else
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
if (!IsSigned && !Subtarget.hasVLX()) {
assert(Subtarget.useAVX512Regs() && "Unexpected features!");
// Widen to 512-bits.
ResVT = MVT::v8i32;
TruncVT = MVT::v8i1;
Opc = Op.getOpcode();
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
// TODO: Should we just do this for non-strict as well?
SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
: DAG.getUNDEF(MVT::v8f64);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
}
SDValue Res, Chain;
if (IsStrict) {
Res =
DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Opc, dl, ResVT, Src);
}
Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
// v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
assert(!IsSigned && "Expected unsigned conversion!");
assert(Subtarget.useAVX512Regs() && "Requires avx512f");
return Op;
}
// Widen vXi32 fp_to_uint with avx512f to 512-bit source.
if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
(SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
assert(!IsSigned && "Expected unsigned conversion!");
assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
"Unexpected features!");
MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
// TODO: Should we just do this for non-strict as well?
SDValue Tmp =
IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
{Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
}
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
// Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
(SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
!Subtarget.hasVLX() && "Unexpected features!");
MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
// TODO: Should we just do this for non-strict as well?
SDValue Tmp =
IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
{Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
}
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
if (IsStrict) {
unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
: X86ISD::STRICT_CVTTP2UI;
return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
}
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
return DAG.getNode(Opc, dl, VT, Tmp);
}
return SDValue();
}
assert(!VT.isVector());
bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
if (!IsSigned && UseSSEReg) {
// Conversions from f32/f64 with AVX512 should be legal.
if (Subtarget.hasAVX512())
return Op;
// Use default expansion for i64.
if (VT == MVT::i64)
return SDValue();
assert(VT == MVT::i32 && "Unexpected VT!");
// Promote i32 to i64 and use a signed operation on 64-bit targets.
// FIXME: This does not generate an invalid exception if the input does not
// fit in i32. PR44019
if (Subtarget.is64Bit()) {
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
{ Op.getOperand(0), Src });
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
if (IsStrict)
return DAG.getMergeValues({ Res, Chain }, dl);
return Res;
}
// Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
// use fisttp which will be handled later.
if (!Subtarget.hasSSE3())
return SDValue();
}
// Promote i16 to i32 if we can use a SSE operation or the type is f128.
// FIXME: This does not generate an invalid exception if the input does not
// fit in i16. PR44019
if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
{ Op.getOperand(0), Src });
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
if (IsStrict)
return DAG.getMergeValues({ Res, Chain }, dl);
return Res;
}
// If this is a FP_TO_SINT using SSEReg we're done.
if (UseSSEReg && IsSigned)
return Op;
// fp128 needs to use a libcall.
if (SrcVT == MVT::f128) {
RTLIB::Libcall LC;
if (IsSigned)
LC = RTLIB::getFPTOSINT(SrcVT, VT);
else
LC = RTLIB::getFPTOUINT(SrcVT, VT);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
SDLoc(Op), Chain);
if (IsStrict)
return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
return Tmp.first;
}
// Fall back to X87.
SDValue Chain;
if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
if (IsStrict)
return DAG.getMergeValues({V, Chain}, dl);
return V;
}
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
if (VT == MVT::f128) {
RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
return LowerF128Call(Op, DAG, LC);
}
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
SDValue Res =
DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
if (IsStrict)
return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
{Op->getOperand(0), Res});
return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
}
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
// It's legal except when f128 is involved
if (SVT != MVT::f128)
return Op;
RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
SDLoc dl(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
dl, Chain);
if (IsStrict)
return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
return Tmp.first;
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// If both operands have other uses, this is probably not profitable.
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (!LHS.hasOneUse() && !RHS.hasOneUse())
return Op;
// FP horizontal add/sub were added with SSE3. Integer with SSSE3.
bool IsFP = Op.getSimpleValueType().isFloatingPoint();
if (IsFP && !Subtarget.hasSSE3())
return Op;
if (!IsFP && !Subtarget.hasSSSE3())
return Op;
// Extract from a common vector.
if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
LHS.getOperand(0) != RHS.getOperand(0) ||
!isa<ConstantSDNode>(LHS.getOperand(1)) ||
!isa<ConstantSDNode>(RHS.getOperand(1)) ||
!shouldUseHorizontalOp(true, DAG, Subtarget))
return Op;
// Allow commuted 'hadd' ops.
// TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
unsigned HOpcode;
switch (Op.getOpcode()) {
case ISD::ADD: HOpcode = X86ISD::HADD; break;
case ISD::SUB: HOpcode = X86ISD::HSUB; break;
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
default:
llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
}
unsigned LExtIndex = LHS.getConstantOperandVal(1);
unsigned RExtIndex = RHS.getConstantOperandVal(1);
if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
(HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
std::swap(LExtIndex, RExtIndex);
if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
return Op;
SDValue X = LHS.getOperand(0);
EVT VecVT = X.getValueType();
unsigned BitWidth = VecVT.getSizeInBits();
unsigned NumLanes = BitWidth / 128;
unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
"Not expecting illegal vector widths here");
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
// equivalent, so extract the 256/512-bit source op to 128-bit if we can.
SDLoc DL(Op);
if (BitWidth == 256 || BitWidth == 512) {
unsigned LaneIdx = LExtIndex / NumEltsPerLane;
X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
LExtIndex %= NumEltsPerLane;
}
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
// sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
DAG.getIntPtrConstant(LExtIndex / 2, DL));
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
"Only expecting float/double");
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
}
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
"Wrong opcode for lowering FABS or FNEG.");
bool IsFABS = (Op.getOpcode() == ISD::FABS);
// If this is a FABS and it has an FNEG user, bail out to fold the combination
// into an FNABS. We'll lower the FABS after that if it is still in use.
if (IsFABS)
for (SDNode *User : Op->uses())
if (User->getOpcode() == ISD::FNEG)
return Op;
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
bool IsF128 = (VT == MVT::f128);
assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
"Unexpected type in LowerFABSorFNEG");
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
// decide if we should generate a 16-byte constant mask when we only need 4 or
// 8 bytes for the scalar case.
// There are no scalar bitwise logical SSE/AVX instructions, so we
// generate a 16-byte vector constant and logic op even for the scalar case.
// Using a 16-byte mask allows folding the load of the mask with
// the logic op, so it can save (~4 bytes) on code size.
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
unsigned EltBits = VT.getScalarSizeInBits();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
APInt::getSignMask(EltBits);
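// e.g. for f32: FABS is an AND with 0x7FFFFFFF (clear the sign bit) and
// FNEG is an XOR with 0x80000000 (flip the sign bit).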
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
unsigned LogicOp = IsFABS ? X86ISD::FAND :
IsFNABS ? X86ISD::FOR :
X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
if (VT.isVector() || IsF128)
return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
// For the scalar case extend to a 128-bit vector, perform the logic op,
// and extract the scalar result back out.
Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Mag = Op.getOperand(0);
SDValue Sign = Op.getOperand(1);
SDLoc dl(Op);
// If the sign operand is smaller, extend it first.
MVT VT = Op.getSimpleValueType();
if (Sign.getSimpleValueType().bitsLT(VT))
Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
// And if it is bigger, shrink it first.
if (Sign.getSimpleValueType().bitsGT(VT))
Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
bool IsF128 = (VT == MVT::f128);
assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
"Unexpected type in LowerFCOPYSIGN");
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
// Perform all scalar logic operations as 16-byte vectors because there are no
// scalar FP logic instructions in SSE.
// TODO: This isn't necessary. If we used scalar types, we might avoid some
// unnecessary splats, but we might miss load folding opportunities. Should
// this decision be based on OptimizeForSize?
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
// The mask constants are automatically splatted for vector types.
unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue SignMask = DAG.getConstantFP(
APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
SDValue MagMask = DAG.getConstantFP(
APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
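// i.e. copysign(Mag, Sign) = (Mag & MagMask) | (Sign & SignMask), computed
// below with FAND/FOR in the vector domain.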
// First, clear all bits but the sign bit from the second operand (sign).
if (IsFakeVector)
Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
// Next, clear the sign bit from the first operand (magnitude).
// TODO: If we had general constant folding for FP logic ops, this check
// wouldn't be necessary.
SDValue MagBits;
if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
APFloat APF = Op0CN->getValueAPF();
APF.clearSign();
MagBits = DAG.getConstantFP(APF, dl, LogicVT);
} else {
// If the magnitude operand wasn't a constant, we need to AND out the sign.
if (IsFakeVector)
Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
}
// OR the magnitude value with the sign bit.
SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT OpVT = N0.getSimpleValueType();
assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
"Unexpected type for FGETSIGN");
// Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
Res = DAG.getZExtOrTrunc(Res, dl, VT);
Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
return Res;
}
/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
SelectionDAG &DAG) {
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
}
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
/// style scalarized (associative) reduction patterns.
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
SmallVectorImpl<SDValue> &SrcOps) {
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
// Recognize a special case where a vector is cast into a wide integer to
// test all 0s.
assert(Op.getOpcode() == unsigned(BinOp) &&
"Unexpected bit reduction opcode");
Opnds.push_back(Op.getOperand(0));
Opnds.push_back(Op.getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
// BFS traverse all BinOp operands.
if (I->getOpcode() == unsigned(BinOp)) {
Opnds.push_back(I->getOperand(0));
Opnds.push_back(I->getOperand(1));
// Re-evaluate the number of nodes to be traversed.
e += 2; // 2 more nodes (LHS and RHS) are pushed.
continue;
}
// Quit if we find a non-EXTRACT_VECTOR_ELT.
if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return false;
// Quit if the index is not a constant.
SDValue Idx = I->getOperand(1);
if (!isa<ConstantSDNode>(Idx))
return false;
SDValue Src = I->getOperand(0);
DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
if (M == SrcOpMap.end()) {
VT = Src.getValueType();
// Quit if not the same type.
if (SrcOpMap.begin() != SrcOpMap.end() &&
VT != SrcOpMap.begin()->first.getValueType())
return false;
unsigned NumElts = VT.getVectorNumElements();
APInt EltCount = APInt::getNullValue(NumElts);
M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
SrcOps.push_back(Src);
}
// Quit if element already used.
unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
if (M->second[CIdx])
return false;
M->second.setBit(CIdx);
}
// Quit if not all elements are used.
for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
E = SrcOpMap.end();
I != E; ++I) {
if (!I->second.isAllOnesValue())
return false;
}
return true;
}
// Check whether an OR'd tree is PTEST-able.
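// PTEST sets ZF when (SRC1 & SRC2) == 0, so testing the OR'd vector against
// itself sets ZF iff every element is zero.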
static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &X86CC) {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
if (!Subtarget.hasSSE41() || !Op->hasOneUse())
return SDValue();
SmallVector<SDValue, 8> VecIns;
if (!matchScalarReduction(Op, ISD::OR, VecIns))
return SDValue();
// Quit if not 128/256-bit vector.
EVT VT = VecIns[0].getValueType();
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
SDLoc DL(Op);
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
// Cast all vectors into TestVT for PTEST.
for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
// If more than one full vector is evaluated, OR them first before PTEST.
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
// Each iteration will OR 2 nodes and append the result until there is only
// 1 node left, i.e. the final OR'd value of all vectors.
SDValue LHS = VecIns[Slot];
SDValue RHS = VecIns[Slot + 1];
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
DL, MVT::i8);
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// Return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
++UI) {
SDNode *User = *UI;
unsigned UOpNo = UI.getOperandNo();
if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
// Look past the truncate.
UOpNo = User->use_begin().getOperandNo();
User = *User->use_begin();
}
if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
!(User->getOpcode() == ISD::SELECT && UOpNo == 0))
return true;
}
return false;
}
// Transform to an x86-specific ALU node with flags if there is a chance of
// using an RMW op or only the flags are used. Otherwise, leave
// the node alone and emit a 'cmp' or 'test' instruction.
static bool isProfitableToUseFlagOp(SDValue Op) {
for (SDNode *U : Op->uses())
if (U->getOpcode() != ISD::CopyToReg &&
U->getOpcode() != ISD::SETCC &&
U->getOpcode() != ISD::STORE)
return false;
return true;
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
bool NeedOF = false;
switch (X86CC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
NeedCF = true;
break;
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
case X86::COND_O: case X86::COND_NO: {
// Check if we really need to set the Overflow flag. If NoSignedWrap is
// present, it is not actually needed.
switch (Op->getOpcode()) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::SHL:
if (Op.getNode()->getFlags().hasNoSignedWrap())
break;
LLVM_FALLTHROUGH;
default:
NeedOF = true;
break;
}
break;
}
}
// See if we can use the EFLAGS value from the operand instead of
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
unsigned Opcode = 0;
unsigned NumOperands = 0;
SDValue ArithOp = Op;
// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
// which may be the result of a CAST. We use the variable 'Op', which is the
// non-casted variable when we check for possible users.
switch (ArithOp.getOpcode()) {
case ISD::AND:
// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
// because a TEST instruction will be better.
if (!hasNonFlagsUse(Op))
break;
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
if (!isProfitableToUseFlagOp(Op))
break;
// Otherwise use a regular EFLAGS-setting instruction.
switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
case ISD::ADD: Opcode = X86ISD::ADD; break;
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: Opcode = X86ISD::OR; break;
}
NumOperands = 2;
break;
case X86ISD::ADD:
case X86ISD::SUB:
case X86ISD::OR:
case X86ISD::XOR:
case X86ISD::AND:
return SDValue(Op.getNode(), 1);
case ISD::SSUBO:
case ISD::USUBO: {
// USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
Op->getOperand(1)).getValue(1);
}
default:
break;
}
if (Opcode == 0) {
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1,
unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue Chain, bool IsSignaling) {
if (isNullConstant(Op1))
return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain);
EVT CmpVT = Op0.getValueType();
if (CmpVT.isFloatingPoint()) {
if (Chain) {
SDValue Res =
DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
return std::make_pair(Res, Res.getValue(1));
}
return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1),
SDValue());
}
assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
// Only promote the compare up to i32 if it is a 16-bit operation
// with an immediate. 16-bit immediates are to be avoided.
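// (A 16-bit immediate requires a 0x66 operand-size prefix, which can cause
// length-changing-prefix stalls in the decoders of some Intel CPUs.)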
if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
!DAG.getMachineFunction().getFunction().hasMinSize()) {
ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
// Don't do this if the immediate can fit in 8-bits.
if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
(COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
unsigned ExtendOp =
isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
// For equality comparisons try to use SIGN_EXTEND if the input was
// truncate from something with enough sign bits.
if (Op0.getOpcode() == ISD::TRUNCATE) {
SDValue In = Op0.getOperand(0);
unsigned EffBits =
In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
if (EffBits <= 16)
ExtendOp = ISD::SIGN_EXTEND;
} else if (Op1.getOpcode() == ISD::TRUNCATE) {
SDValue In = Op1.getOperand(0);
unsigned EffBits =
In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
if (EffBits <= 16)
ExtendOp = ISD::SIGN_EXTEND;
}
}
CmpVT = MVT::i32;
Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
}
}
// Try to shrink i64 compares if the input has enough zero bits.
// FIXME: Do this for non-constant compares for constant on LHS?
if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
CmpVT = MVT::i32;
Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
}
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
return std::make_pair(Sub.getValue(1), SDValue());
}
/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SelectionDAG &DAG) const {
// If the subtarget does not support the FUCOMI instruction, floating-point
// comparisons have to be converted.
bool IsCmp = Cmp.getOpcode() == X86ISD::CMP;
bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP ||
Cmp.getOpcode() == X86ISD::STRICT_FCMPS;
if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) ||
!Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() ||
!Cmp.getOperand(IsStrictCmp ? 2 : 1).getValueType().isFloatingPoint())
return Cmp;
// The instruction selector will select an FUCOM instruction instead of
// FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
// build an SDNode sequence that transfers the result from FPSW into EFLAGS:
// (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8))))
SDLoc dl(Cmp);
SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
DAG.getConstant(8, dl, MVT::i8));
SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
// Some 64-bit targets lack SAHF support, but they do support FCOMI.
assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
// We never want to use both SQRT and RSQRT instructions for the same input.
if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
return false;
if (VT.isVector())
return Subtarget.hasFastVectorFSQRT();
return Subtarget.hasFastScalarFSQRT();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
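/// A single reciprocal-sqrt refinement step is
/// X1 = X0 * (1.5 - 0.5 * A * X0 * X0), which roughly doubles the number
/// of correct bits.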
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Op.getValueType();
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
// TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
// after legalize types.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
(VT == MVT::v8f32 && Subtarget.hasAVX()) ||
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
UseOneConstNR = false;
// There is no FRSQRT for 512-bit vectors, but there is RSQRT14.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
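/// A single reciprocal refinement step is X1 = X0 * (2.0 - A * X0), which
/// roughly doubles the number of correct bits.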
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
int Enabled,
int &RefinementSteps) const {
EVT VT = Op.getValueType();
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// It is likely not profitable to do this for f64 because a double-precision
// reciprocal estimate with refinement on x86 prior to FMA requires
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v8f32 && Subtarget.hasAVX()) ||
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
// Enable estimate codegen with 1 refinement step for vector division.
// Scalar division estimates are disabled because they break too much
// real-world code. These defaults are intended to match GCC behavior.
if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
return SDValue();
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
// There is no FRCP for 512-bit vectors, but there is RCP14.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
SDValue
X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
"Unexpected divisor!");
// Only perform this transform if CMOV is supported otherwise the select
// below will become a branch.
if (!Subtarget.hasCMov())
return SDValue();
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
// FIXME: Support i8.
if (VT != MVT::i16 && VT != MVT::i32 &&
!(Subtarget.is64Bit() && VT == MVT::i64))
return SDValue();
unsigned Lg2 = Divisor.countTrailingZeros();
// If the divisor is 2 or -2, the default expansion is better.
if (Lg2 == 1)
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue Zero = DAG.getConstant(0, DL, VT);
APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
// If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
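// e.g. for N0 = -7, Lg2 = 2: (-7 + 3) >> 2 = -1, matching C's
// truncate-toward-zero division, where a plain arithmetic shift would
// give floor(-7 / 4) = -2.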
SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
Created.push_back(Cmp.getNode());
Created.push_back(Add.getNode());
Created.push_back(CMov.getNode());
// Divide by pow2.
SDValue SRA =
DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
if (Divisor.isNonNegative())
return SRA;
Created.push_back(SRA.getNode());
return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
}
/// Result of 'and' is compared against zero. Change to a BT node if possible.
/// Returns the BT node and the condition code needed to use it.
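/// For example (illustrative): (X & (1 << N)) != 0 becomes BT X, N with
/// condition X86::COND_B, while the == 0 form uses X86::COND_AE.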
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
SDValue &X86CC) {
assert(And.getOpcode() == ISD::AND && "Expected AND node!");
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
Op0 = Op0.getOperand(0);
if (Op1.getOpcode() == ISD::TRUNCATE)
Op1 = Op1.getOperand(0);
SDValue Src, BitNo;
if (Op1.getOpcode() == ISD::SHL)
std::swap(Op0, Op1);
if (Op0.getOpcode() == ISD::SHL) {
if (isOneConstant(Op0.getOperand(0))) {
// If we looked past a truncate, check that it's only truncating away
// known zeros.
unsigned BitWidth = Op0.getValueSizeInBits();
unsigned AndBitWidth = And.getValueSizeInBits();
if (BitWidth > AndBitWidth) {
KnownBits Known = DAG.computeKnownBits(Op0);
if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
}
Src = Op1;
BitNo = Op0.getOperand(1);
}
} else if (Op1.getOpcode() == ISD::Constant) {
ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
uint64_t AndRHSVal = AndRHS->getZExtValue();
SDValue AndLHS = Op0;
if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
Src = AndLHS.getOperand(0);
BitNo = AndLHS.getOperand(1);
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
// are optimizing for size and the immediate won't fit in a byte.
bool OptForSize = DAG.shouldOptForSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
Src.getValueType());
}
}
}
// No patterns found, give up.
if (!Src.getNode())
return SDValue();
// If Src is i8, promote it to i32 with any_extend. There is no i8 BT
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i32 value is ok. We extend to i32 because
// the encoding for the i16 version is larger than the i32 version.
// Also promote i16 to i32 for performance / code size reasons.
if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
// See if we can use the 32-bit instruction instead of the 64-bit one for a
// shorter encoding. Since the former takes the modulo 32 of BitNo and the
// latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
// known to be zero.
if (Src.getValueType() == MVT::i64 &&
DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
// If the operand types disagree, extend the shift amount to match. Since
// BT ignores high bits (like shifts) we can use anyextend.
if (Src.getValueType() != BitNo.getValueType())
BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
dl, MVT::i8);
return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
}
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
SDValue &Op1, bool &IsAlwaysSignaling) {
unsigned SSECC;
bool Swap = false;
// SSE Condition code mapping:
// 0 - EQ
// 1 - LT
// 2 - LE
// 3 - UNORD
// 4 - NEQ
// 5 - NLT
// 6 - NLE
// 7 - ORD
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETOEQ:
case ISD::SETEQ: SSECC = 0; break;
case ISD::SETOGT:
case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETLT:
case ISD::SETOLT: SSECC = 1; break;
case ISD::SETOGE:
case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETLE:
case ISD::SETOLE: SSECC = 2; break;
case ISD::SETUO: SSECC = 3; break;
case ISD::SETUNE:
case ISD::SETNE: SSECC = 4; break;
case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: SSECC = 5; break;
case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGT: SSECC = 6; break;
case ISD::SETO: SSECC = 7; break;
case ISD::SETUEQ: SSECC = 8; break;
case ISD::SETONE: SSECC = 12; break;
}
if (Swap)
std::swap(Op0, Op1);
switch (SetCCOpcode) {
default:
IsAlwaysSignaling = true;
break;
case ISD::SETEQ:
case ISD::SETOEQ:
case ISD::SETUEQ:
case ISD::SETNE:
case ISD::SETONE:
case ISD::SETUNE:
case ISD::SETO:
case ISD::SETUO:
IsAlwaysSignaling = false;
break;
}
return SSECC;
}
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
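/// For example (illustrative), a v8i32 compare on an AVX1-only target is
/// lowered as two v4i32 compares on the low and high 128-bit halves whose
/// results are rejoined with CONCAT_VECTORS.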
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
SDValue CC = Op.getOperand(2);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert(VT.getVectorElementType() == MVT::i1 &&
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
// Prefer SETGT over SETLT.
if (SetCCOpcode == ISD::SETLT) {
SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
std::swap(Op0, Op1);
}
return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}
/// Given a buildvector constant, return a new vector constant with each element
/// incremented or decremented. If incrementing or decrementing would result in
/// unsigned overflow or underflow, or if this is not a simple vector constant,
/// return an empty value.
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
if (!BV)
return SDValue();
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
SmallVector<SDValue, 8> NewVecC;
SDLoc DL(V);
for (unsigned i = 0; i < NumElts; ++i) {
auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
return SDValue();
// Avoid overflow/underflow.
const APInt &EltC = Elt->getAPIntValue();
if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
return SDValue();
NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
}
return DAG.getBuildVector(VT, DL, NewVecC);
}
/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
/// Op0 u<= Op1:
/// t = psubus Op0, Op1
/// pcmpeq t, <0..0>
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
ISD::CondCode Cond, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (!Subtarget.hasSSE2())
return SDValue();
MVT VET = VT.getVectorElementType();
if (VET != MVT::i8 && VET != MVT::i16)
return SDValue();
switch (Cond) {
default:
return SDValue();
case ISD::SETULT: {
// If the comparison is against a constant, we can turn this into a
// setule. With psubus, setule does not require a swap. This is
// beneficial because the register holding the constant is no longer
// clobbered as the destructive destination operand, so it can be
// hoisted out of a loop.
// Only do this pre-AVX, since with AVX vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
return SDValue();
SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
if (!ULEOp1)
return SDValue();
Op1 = ULEOp1;
break;
}
case ISD::SETUGT: {
// If the comparison is against a constant, we can turn this into a setuge.
// This is beneficial because materializing a constant 0 for the PCMPEQ is
// probably cheaper than XOR+PCMPGT using 2 different vector constants:
// cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
if (!UGEOp1)
return SDValue();
Op1 = Op0;
Op0 = UGEOp1;
break;
}
// Psubus is better than flip-sign because it requires no inversion.
case ISD::SETUGE:
std::swap(Op0, Op1);
break;
case ISD::SETULE:
break;
}
SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
DAG.getConstant(0, dl, VT));
}
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
MVT VT = Op->getSimpleValueType(0);
ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op1.getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
if (isFP) {
#ifndef NDEBUG
MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16);
Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
// The SSE/AVX packed FP comparison nodes are defined with a
// floating-point vector result that matches the operand type. This allows
// them to work with an SSE1 target (integer vector types are not legal).
VT = Op0.getSimpleValueType();
}
SDValue Cmp;
bool IsAlwaysSignaling;
unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
if (!Subtarget.hasAVX()) {
// TODO: We could use the following steps to handle a quiet compare with
// signaling encodings.
// 1. Get ordered masks from a quiet ISD::SETO.
// 2. Use the masks to mask potential unordered elements in operands A and B.
// 3. Get the compare results of the masked A and B.
// 4. Calculate the final result using the mask and the result from step 3.
// But currently, we just fall back to scalar operations.
if (IsStrict && IsAlwaysSignaling && !IsSignaling)
return SDValue();
// Insert an extra signaling instruction to raise exception.
if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
SDValue SignalCmp = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
// FIXME: It seems we need to update the flags of all new strict nodes.
// Otherwise, mayRaiseFPException in MI will return false due to
// NoFPExcept = false by default. However, this does not appear to be
// done in other patches.
SignalCmp->setFlags(Op->getFlags());
Chain = SignalCmp.getValue(1);
}
// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
// emit two comparisons and a logic op to tie them together.
if (SSECC >= 8) {
// LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
if (Cond == ISD::SETUEQ) {
CC0 = 3; // UNORD
CC1 = 0; // EQ
CombineOpc = X86ISD::FOR;
} else {
assert(Cond == ISD::SETONE);
CC0 = 7; // ORD
CC1 = 4; // NEQ
CombineOpc = X86ISD::FAND;
}
SDValue Cmp0, Cmp1;
if (IsStrict) {
Cmp0 = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
Cmp1 = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
Cmp1.getValue(1));
} else {
Cmp0 = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
Cmp1 = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
}
Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
if (IsStrict) {
Cmp = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
Chain = Cmp.getValue(1);
} else
Cmp = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
} else {
// Handle all other FP comparisons here.
if (IsStrict) {
// Set bit 4 of the AVX CC (the signaling bit) when the CC's default
// signaling behavior differs from the one requested.
SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
Cmp = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
Chain = Cmp.getValue(1);
} else
Cmp = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
// result type of SETCC. The bitcast is expected to be optimized away
// during combining/isel.
Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
if (IsStrict)
return DAG.getMergeValues({Cmp, Chain}, dl);
return Cmp;
}
assert(!IsStrict && "Strict SETCC only handles FP operands.");
MVT VTOp0 = Op0.getSimpleValueType();
(void)VTOp0;
assert(VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!");
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!");
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In the AVX-512 architecture, setcc returns a mask with i1 elements,
// but there is no compare instruction for i8 and i16 elements in KNL.
assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
"Unexpected operand type");
return LowerIntVSETCC_AVX512(Op, DAG);
}
// Lower using XOP integer comparisons.
if (VT.is128BitVector() && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (Cond) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETULT:
case ISD::SETLT: CmpMode = 0x00; break;
case ISD::SETULE:
case ISD::SETLE: CmpMode = 0x01; break;
case ISD::SETUGT:
case ISD::SETGT: CmpMode = 0x02; break;
case ISD::SETUGE:
case ISD::SETGE: CmpMode = 0x03; break;
case ISD::SETEQ: CmpMode = 0x04; break;
case ISD::SETNE: CmpMode = 0x05; break;
}
// Are we comparing unsigned or signed integers?
unsigned Opc =
ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getTargetConstant(CmpMode, dl, MVT::i8));
}
// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
SDValue BC0 = peekThroughBitcasts(Op0);
if (BC0.getOpcode() == ISD::AND) {
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
if (getTargetConstantBitsFromNode(BC0.getOperand(1),
VT.getScalarSizeInBits(), UndefElts,
EltBits, false, false)) {
if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
Cond = ISD::SETEQ;
Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
}
}
}
}
// ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
ConstantSDNode *C1 = isConstOrConstSplat(Op1);
if (C1 && C1->getAPIntValue().isPowerOf2()) {
unsigned BitWidth = VT.getScalarSizeInBits();
unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
SDValue Result = Op0.getOperand(0);
Result = DAG.getNode(ISD::SHL, dl, VT, Result,
DAG.getConstant(ShiftAmt, dl, VT));
Result = DAG.getNode(ISD::SRA, dl, VT, Result,
DAG.getConstant(BitWidth - 1, dl, VT));
return Result;
}
}
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntVSETCC(Op, DAG);
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT,
// which will later be swapped to SETGT.
// Otherwise we use PCMPEQ+invert.
APInt ConstValue;
if (Cond == ISD::SETNE &&
ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
if (ConstValue.isMinSignedValue())
Cond = ISD::SETGT;
else if (ConstValue.isMaxSignedValue())
Cond = ISD::SETLT;
}
// If both operands are known non-negative, then an unsigned compare is the
// same as a signed compare and there's no need to flip signbits.
// TODO: We could check for more general simplifications here since we're
// computing known bits.
bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
// Special case: Use min/max operations for unsigned compares.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ISD::isUnsignedIntSetCC(Cond) &&
(FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
TLI.isOperationLegal(ISD::UMIN, VT)) {
// If we have a constant operand, increment/decrement it and change the
// condition to avoid an invert.
if (Cond == ISD::SETUGT) {
// X > C --> X >= (C+1) --> X == umax(X, C+1)
if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
Op1 = UGTOp1;
Cond = ISD::SETUGE;
}
}
if (Cond == ISD::SETULT) {
// X < C --> X <= (C-1) --> X == umin(X, C-1)
if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
Op1 = ULTOp1;
Cond = ISD::SETULE;
}
}
bool Invert = false;
unsigned Opc;
switch (Cond) {
default: llvm_unreachable("Unexpected condition code");
case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETULE: Opc = ISD::UMIN; break;
case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: Opc = ISD::UMAX; break;
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
}
// Try to use SUBUS and PCMPEQ.
if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
return V;
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integers, swapping operands and multiple
// operations may be required for some comparisons.
unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
: X86ISD::PCMPGT;
bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
Cond == ISD::SETGE || Cond == ISD::SETUGE;
bool Invert = Cond == ISD::SETNE ||
(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
if (Swap)
std::swap(Op0, Op1);
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
assert(Subtarget.hasSSE2() && "Don't know how to lower!");
// Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
// the odd elements over the even elements.
if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
Op0 = DAG.getConstant(0, dl, MVT::v4i32);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
static const int MaskHi[] = { 1, 1, 3, 3 };
SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
return DAG.getBitcast(VT, Result);
}
if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
static const int MaskHi[] = { 1, 1, 3, 3 };
SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
return DAG.getBitcast(VT, Result);
}
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
SDValue SB;
if (FlipSigns) {
SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
} else {
SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
}
Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
// Cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
// Create masks for only the low parts/high parts of the 64-bit integers.
static const int MaskHi[] = { 1, 1, 3, 3 };
static const int MaskLo[] = { 0, 0, 2, 2 };
SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
}
if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
// If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
// pcmpeqd + pshufd + pand.
assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Do the compare.
SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
// Make sure the lower and upper halves are both all-ones.
static const int Mask[] = { 1, 0, 3, 2 };
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
}
}
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
MVT EltVT = VT.getVectorElementType();
SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
VT);
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
}
// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
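// Illustrative examples (assuming the mask types are legal):
//   (bitcast v16i1 K) == 0      -> KORTEST K, K + SETE
//   (bitcast v16i1 K) == -1     -> KORTEST K, K + SETB (C flag set on all ones)
//   (bitcast (and A, B)) == 0   -> KTEST A, B   + SETE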
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue &X86CC) {
// Only support equality comparisons.
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return SDValue();
// Must be a bitcast from vXi1.
if (Op0.getOpcode() != ISD::BITCAST)
return SDValue();
Op0 = Op0.getOperand(0);
MVT VT = Op0.getSimpleValueType();
if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
!(Subtarget.hasDQI() && VT == MVT::v8i1) &&
!(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
return SDValue();
X86::CondCode X86Cond;
if (isNullConstant(Op1)) {
X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
} else if (isAllOnesConstant(Op1)) {
// C flag is set for all ones.
X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
} else
return SDValue();
// If the input is an AND, we can combine its operands into the KTEST.
bool KTestable = false;
if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
KTestable = true;
if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
KTestable = true;
if (!isNullConstant(Op1))
KTestable = false;
if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
SDValue LHS = Op0.getOperand(0);
SDValue RHS = Op0.getOperand(1);
X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
}
// If the input is an OR, we can combine its operands into the KORTEST.
SDValue LHS = Op0;
SDValue RHS = Op0;
if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
LHS = Op0.getOperand(0);
RHS = Op0.getOperand(1);
}
X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
}
/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
SelectionDAG &DAG, SDValue &X86CC,
SDValue &Chain,
bool IsSignaling) const {
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
// Lower ((X >>s N) & 1) != 0 to BT(X, N).
if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
return BT;
}
// Try to use PTEST for a tree of ORs compared for equality with 0.
// TODO: We could do an AND tree with all 1s as well by using the C flag.
if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
return PTEST;
}
// Try to lower using KORTEST or KTEST.
if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
return Test;
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// If the input is a setcc, then reuse the input setcc or use a new one with
// the inverted condition.
if (Op0.getOpcode() == X86ISD::SETCC) {
bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
X86CC = Op0.getOperand(0);
if (Invert) {
X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
}
return Op0.getOperand(1);
}
}
// Try to use the carry flag from the add in place of a separate CMP for:
// (seteq (add X, -1), -1). Similar for setne.
if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (isProfitableToUseFlagOp(Op0)) {
SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
Op0.getOperand(1));
DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
return SDValue(New.getNode(), 1);
}
}
bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
if (CondCode == X86::COND_INVALID)
return SDValue();
std::pair<SDValue, SDValue> Tmp =
EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling);
SDValue EFLAGS = Tmp.first;
if (Chain)
Chain = Tmp.second;
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
Op.getOpcode() == ISD::STRICT_FSETCCS;
MVT VT = Op->getSimpleValueType(0);
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
SDLoc dl(Op);
ISD::CondCode CC =
cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets handled by emitFlagsForSetcc.
if (Op0.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
Op.getOpcode() == ISD::STRICT_FSETCCS);
// If softenSetCCOperands returned a scalar, use it.
if (!Op1.getNode()) {
assert(Op0.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
if (IsStrict)
return DAG.getMergeValues({Op0, Chain}, dl);
return Op0;
}
}
SDValue X86CC;
SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain,
Op.getOpcode() == ISD::STRICT_FSETCCS);
if (!EFLAGS)
return SDValue();
SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
// Recreate the carry if needed.
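// (Adding all-ones to the carry value regenerates CF: a non-zero carry wraps
// past zero and sets CF, e.g. 1 + 0xFF = 0x00 with CF = 1, while a zero
// carry leaves CF clear.)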
EVT CarryVT = Carry.getValueType();
APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getConstant(NegOne, DL, CarryVT));
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
return getSETCC(CC, Cmp.getValue(1), DL, DAG);
}
// This function returns three things: the arithmetic computation itself
// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
// flag and the condition code define the case in which the arithmetic
// computation overflows.
static std::pair<SDValue, SDValue>
getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
assert(Op.getResNo() == 0 && "Unexpected result number!");
SDValue Value, Overflow;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
unsigned BaseOp = 0;
SDLoc DL(Op);
switch (Op.getOpcode()) {
default: llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
BaseOp = X86ISD::ADD;
Cond = X86::COND_O;
break;
case ISD::UADDO:
BaseOp = X86ISD::ADD;
Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
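// (For X + 1 the unsigned add overflows exactly when the result wraps to
// zero, so ZF is equivalent to CF here; this is likely chosen so isel can
// use INC, which does not update CF.)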
break;
case ISD::SSUBO:
BaseOp = X86ISD::SUB;
Cond = X86::COND_O;
break;
case ISD::USUBO:
BaseOp = X86ISD::SUB;
Cond = X86::COND_B;
break;
case ISD::SMULO:
BaseOp = X86ISD::SMUL;
Cond = X86::COND_O;
break;
case ISD::UMULO:
BaseOp = X86ISD::UMUL;
Cond = X86::COND_O;
break;
}
if (BaseOp) {
// Also sets EFLAGS.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
Overflow = Value.getValue(1);
}
return std::make_pair(Value, Overflow);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Lower the "add/sub/mul with overflow" instruction into a regular
// instruction plus a "setcc" instruction that checks the overflow flag.
// The "brcond" lowering looks for this combo and may remove the "setcc"
// instruction if the "setcc" has only one use.
SDLoc DL(Op);
X86::CondCode Cond;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
}
/// Return true if the opcode is an X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getOpcode();
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
Opc == X86ISD::SAHF)
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
return true;
return false;
}
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
if (V.getOpcode() != ISD::TRUNCATE)
return false;
SDValue VOp0 = V.getOperand(0);
unsigned InBits = VOp0.getValueSizeInBits();
unsigned Bits = V.getValueSizeInBits();
return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
bool AddTest = true;
SDValue Cond = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
SDLoc DL(Op);
MVT VT = Op1.getSimpleValueType();
SDValue CC;
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available, or into VBLENDV when AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
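// Illustrative shape of the SSE sequence built below for
// select(a < b, x, y):
//   m = cmpltss a, b        ; all-ones or all-zeros mask
//   r = (x & m) | (y & ~m)  ; the FAND/FANDN/FOR nodes below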
if (Cond.getOpcode() == ISD::SETCC &&
((Subtarget.hasSSE2() && VT == MVT::f64) ||
(Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
bool IsAlwaysSignaling;
unsigned SSECC =
translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
CondOp0, CondOp1, IsAlwaysSignaling);
if (Subtarget.hasAVX512()) {
SDValue Cmp =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
DAG.getTargetConstant(SSECC, DL, MVT::i8));
assert(!VT.isVector() && "Not a scalar type?");
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
if (SSECC < 8 || Subtarget.hasAVX()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getTargetConstant(SSECC, DL, MVT::i8));
// If we have AVX, we can use a variable vector select (VBLENDV) instead
// of 3 logic instructions for size savings and potentially speed.
// Unfortunately, there is no scalar form of VBLENDV.
// If either operand is a +0.0 constant, don't try this. We can expect to
// optimize away at least one of the logic instructions later in that
// case, so that sequence would be faster than a variable blend.
// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
// uses XMM0 as the selection register. That may need just as many
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
!isNullFPConstant(Op2)) {
// Convert to vectors, do a VSELECT, and convert back to scalar.
// All of the conversions should be optimized away.
MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
VSel, DAG.getIntPtrConstant(0, DL));
}
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
}
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
// For v64i1 without 64-bit support we need to split and rejoin.
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
assert(Subtarget.hasBWI() && "Expected BWI to be legal");
SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
Op1Scalar = Op1.getOperand(0);
SDValue Op2Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
Op2Scalar = Op2.getOperand(0);
if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
Op1Scalar, Op2Scalar);
if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, newSelect);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
DAG.getIntPtrConstant(0, DL));
}
}
if (Cond.getOpcode() == ISD::SETCC) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
// If the condition was updated, it's possible that the operands of the
// select were also updated (for example, EmitTest has a RAUW). Refresh
// the local references to the select operands in case they got stale.
Op1 = Op.getOperand(1);
Op2 = Op.getOperand(2);
}
}
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
// (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
unsigned CondCode = Cond.getConstantOperandVal(0);
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
SDValue CmpOp0 = Cmp.getOperand(0);
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
Zero = DAG.getConstant(0, DL, Op.getValueType());
return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
}
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
SDValue Res = // Res = 0 or -1.
DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
if (!isNullConstant(Op2))
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
SDValue CmpOp0 = Cmp.getOperand(0);
SDValue Src1, Src2;
// Returns true if Op2 is an XOR or OR operator and one of its operands
// is equal to Op1, i.e. the pattern is
//   (a, a op b) || (b, a op b)
auto isOrXorPattern = [&]() {
if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
(Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
Src1 =
Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
Src2 = Op1;
return true;
}
return false;
};
if (isOrXorPattern()) {
SDValue Neg;
unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
// We need a mask of all zeros or all ones with the same size as the
// other operands.
if (CmpSz > VT.getSizeInBits())
Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
else if (CmpSz < VT.getSizeInBits())
Neg = DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
DAG.getConstant(1, DL, VT));
else
Neg = CmpOp0;
SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Neg); // -(and (x, 0x1))
SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
}
}
}
// Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
isOneConstant(Cond.getOperand(1)))
Cond = Cond.getOperand(0);
// If the condition flag is set by an X86ISD::CMP, then use it as the
// condition-setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
if (CondOpcode == X86ISD::SETCC ||
CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
!isScalarFPTypeInSSEReg(VT)) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
Cmp.getOpcode() == X86ISD::BT) { // FIXME
Cond = Cmp;
AddTest = false;
}
} else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
SDValue Value;
X86::CondCode X86Cond;
std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
AddTest = false;
}
if (AddTest) {
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
SDValue BTCC;
if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
CC = BTCC;
Cond = BT;
AddTest = false;
}
}
}
if (AddTest) {
CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
// a < b ? 0 : -1 -> RES = setcc_carry
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::SUB) {
Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
(isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(isNullConstant(Op1) || isNullConstant(Op2))) {
SDValue Res =
DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
return DAG.getNOT(DL, Res, Res.getValueType());
return Res;
}
}
// X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
// widen the cmov and push the truncate through. This avoids introducing a new
// branch during isel and doesn't add any extensions.
if (Op.getValueType() == MVT::i8 &&
Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
// Blacklist CopyFromReg to avoid partial register stalls.
T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
CC, Cond);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
}
// Or finally, promote i8 cmovs if we have CMOV,
// or i16 cmovs if it won't prevent folding a load.
// FIXME: we should not limit promotion of the i8 case to only when the CMOV
// is legal, but EmitLoweredSelect() cannot deal with these extensions being
// inserted between two CMOVs. (This applies to the i16 case as well.)
// https://bugs.llvm.org/show_bug.cgi?id=40974
if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
(Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
!MayFoldLoad(Op2))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
SDValue Ops[] = { Op2, Op1, CC, Cond };
return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
}
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
MVT VTElt = VT.getVectorElementType();
SDLoc dl(Op);
unsigned NumElts = VT.getVectorNumElements();
// Extend VT if the scalar type is i8/i16 and BWI is not supported.
MVT ExtVT = VT;
if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
// If v16i32 is to be avoided, we'll need to split and concatenate.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
}
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
NumElts *= 512 / ExtVT.getSizeInBits();
InVT = MVT::getVectorVT(MVT::i1, NumElts);
In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
In, DAG.getIntPtrConstant(0, dl));
WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
}
SDValue V;
MVT WideEltVT = WideVT.getVectorElementType();
if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
(Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
} else {
SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
SDValue Zero = DAG.getConstant(0, dl, WideVT);
V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
}
// Truncate if we had to extend i16/i8 above.
if (VT != ExtVT) {
WideVT = MVT::getVectorVT(VTElt, NumElts);
V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
}
// Extract back to 128/256-bit if we widened.
if (WideVT != VT)
V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
DAG.getIntPtrConstant(0, dl));
return V;
}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
return LowerAVXExtend(Op, DAG, Subtarget);
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
MVT SVT = VT.getVectorElementType();
MVT InSVT = InVT.getVectorElementType();
assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
!(VT.is256BitVector() && Subtarget.hasAVX()) &&
!(VT.is512BitVector() && Subtarget.hasAVX512()))
return SDValue();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
unsigned NumElts = VT.getVectorNumElements();
// For 256-bit vectors, we only need the lower (128-bit) half of the input.
// For 512-bit vectors, we need 128-bits or 256-bits.
if (InVT.getSizeInBits() > 128) {
// Input needs to be at least the same number of elements as output, and
// at least 128-bits.
int InSize = InSVT.getSizeInBits() * NumElts;
In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
InVT = In.getSimpleValueType();
}
// SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
// results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
// instructions still need to be handled here for 256/512-bit results.
if (Subtarget.hasInt256()) {
assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
if (InVT.getVectorNumElements() != NumElts)
return DAG.getNode(Op.getOpcode(), dl, VT, In);
// FIXME: Apparently we create inreg operations that could be regular
// extends.
unsigned ExtOpc =
Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND;
return DAG.getNode(ExtOpc, dl, VT, In);
}
// pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
if (Subtarget.hasAVX()) {
assert(VT.is256BitVector() && "256-bit vector expected");
MVT HalfVT = VT.getHalfNumVectorElementsVT();
int HalfNumElts = HalfVT.getVectorNumElements();
unsigned NumSrcElts = InVT.getVectorNumElements();
SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
for (int i = 0; i != HalfNumElts; ++i)
HiMask[i] = HalfNumElts + i;
SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
}
// We should only get here for sign extend.
assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
SDValue Curr = In;
SDValue SignExt = Curr;
// As SRAI is only available on i16/i32 types, we expand only up to i32
// and handle i64 separately.
if (InVT != MVT::v4i32) {
MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
unsigned DestWidth = DestVT.getScalarSizeInBits();
unsigned Scale = DestWidth / InSVT.getSizeInBits();
unsigned InNumElts = InVT.getVectorNumElements();
unsigned DestElts = DestVT.getVectorNumElements();
// Build a shuffle mask that takes each input element and places it in the
// MSBs of the new element size.
SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
for (unsigned i = 0; i != DestElts; ++i)
Mask[i * Scale + (Scale - 1)] = i;
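// e.g. (illustrative) for a v16i8 -> v4i32 sign extension: Scale == 4, so
// the mask is { -1,-1,-1,0, -1,-1,-1,1, -1,-1,-1,2, -1,-1,-1,3 }, placing
// each source byte in the MSB of its i32 lane; the VSRAI by 24 below then
// sign-fills the low bits.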
Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
Curr = DAG.getBitcast(DestVT, Curr);
unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
}
if (VT == MVT::v2i64) {
assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
SignExt = DAG.getBitcast(VT, SignExt);
}
return SignExt;
}
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::i64) &&
"Unexpected element type");
assert((InVT.getVectorElementType() == MVT::i8 ||
InVT.getVectorElementType() == MVT::i16 ||
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
if (VT != MVT::v8i64)
return SDValue();
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
}
if (Subtarget.hasInt256())
return Op;
// Optimize vectors in AVX mode:
// sign extend v8i16 to v8i32 and v4i32 to v4i64.
//
// Divide the input vector into two parts; for v4i32 the high shuffle mask
// will be {2, 3, -1, -1}. Use a vpmovsx instruction to extend each half
// (v4i32 -> v2i64; v8i16 -> v4i32), then concat the vectors back to the
// original VT.
MVT HalfVT = VT.getHalfNumVectorElementsVT();
SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
unsigned NumElems = InVT.getVectorNumElements();
SmallVector<int,8> ShufMask(NumElems, -1);
for (unsigned i = 0; i != NumElems/2; ++i)
ShufMask[i] = i + NumElems/2;
SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
/// Change a vector store into a pair of half-size vector stores.
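/// (Illustrative: a simple 32-byte store at Ptr becomes a 16-byte store of
/// the low half at Ptr and one of the high half at Ptr + 16, with the two
/// chains joined by a TokenFactor.)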
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
SDValue StoredVal = Store->getValue();
assert((StoredVal.getValueType().is256BitVector() ||
StoredVal.getValueType().is512BitVector()) &&
"Expecting 256/512-bit op");
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. Assume the input store is legal (this transform is
// only used for targets with AVX). Note: It is possible that we have an
// illegal type like v2i128, and so we could allow splitting a volatile store
// in that case if that is important.
if (!Store->isSimple())
return SDValue();
EVT StoreVT = StoredVal.getValueType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
SDLoc DL(Store);
SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
SDValue Ptr0 = Store->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
unsigned Alignment = Store->getAlignment();
SDValue Ch0 =
DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
Alignment, Store->getMemOperand()->getFlags());
SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
Store->getPointerInfo().getWithOffset(HalfAlign),
MinAlign(Alignment, HalfAlign),
Store->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
}
/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
/// type.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
SelectionDAG &DAG) {
SDValue StoredVal = Store->getValue();
assert(StoreVT.is128BitVector() &&
StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
StoredVal = DAG.getBitcast(StoreVT, StoredVal);
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. We are assuming the input op is legal (this transform
// is only used for targets with AVX).
if (!Store->isSimple())
return SDValue();
MVT StoreSVT = StoreVT.getScalarType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned ScalarSize = StoreSVT.getStoreSize();
unsigned Alignment = Store->getAlignment();
SDLoc DL(Store);
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Offset = i * ScalarSize;
SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
DAG.getIntPtrConstant(i, DL));
SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
Store->getPointerInfo().getWithOffset(Offset),
MinAlign(Alignment, Offset),
Store->getMemOperand()->getFlags());
Stores.push_back(Ch);
}
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
SDLoc dl(St);
SDValue StoredVal = St->getValue();
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
if (StoredVal.getValueType().isVector() &&
StoredVal.getValueType().getVectorElementType() == MVT::i1) {
assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
"Unexpected VT");
assert(!St->isTruncatingStore() && "Expected non-truncating store");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getUNDEF(MVT::v16i1), StoredVal,
DAG.getIntPtrConstant(0, dl));
StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
if (St->isTruncatingStore())
return SDValue();
// If this is a 256-bit store of concatenated ops, we are better off splitting
// that store into two 128-bit stores. This avoids spurious use of 256-bit ops
// and each half can execute independently. Some cores would split the op into
// halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
if (StoreVT.is256BitVector()) {
SmallVector<SDValue, 4> CatOps;
if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
return splitVectorStore(St, DAG);
return SDValue();
}
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
TargetLowering::TypeWidenVector && "Unexpected type action!");
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
if (Subtarget.hasSSE2()) {
// Widen the vector, cast to a v2x64 type, extract the single 64-bit element
// and store it.
MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
MVT CastVT = MVT::getVectorVT(StVT, 2);
StoredVal = DAG.getBitcast(CastVT, StoredVal);
StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
DAG.getIntPtrConstant(0, dl));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
St->getMemOperand());
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector loads.");
assert(RegVT.isInteger() &&
"We only custom lower integer vector loads.");
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
if (RegVT.getVectorElementType() == MVT::i1) {
assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
// Replace chain users with the new chain.
assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
DAG.getBitcast(MVT::v16i1, Val),
DAG.getIntPtrConstant(0, dl));
return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
}
return SDValue();
}
/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
/// each of which has no other use apart from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Opc = Op.getOpcode();
if (Opc != ISD::OR && Opc != ISD::AND)
return false;
return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
Op.getOperand(0).hasOneUse() &&
Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
Op.getOperand(1).hasOneUse());
}
/// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
/// SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
if (Op.getOpcode() != ISD::XOR)
return false;
if (isOneConstant(Op.getOperand(1)))
return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
Op.getOperand(0).hasOneUse();
return false;
}
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
bool addTest = true;
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
SDValue CC;
bool Inverted = false;
if (Cond.getOpcode() == ISD::SETCC) {
// Check for setcc([su]{add,sub,mul}o == 0).
if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
isNullConstant(Cond.getOperand(1)) &&
Cond.getOperand(0).getResNo() == 1 &&
(Cond.getOperand(0).getOpcode() == ISD::SADDO ||
Cond.getOperand(0).getOpcode() == ISD::UADDO ||
Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
Cond.getOperand(0).getOpcode() == ISD::USUBO ||
Cond.getOperand(0).getOpcode() == ISD::SMULO ||
Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
Inverted = true;
Cond = Cond.getOperand(0);
} else {
if (SDValue NewCond = LowerSETCC(Cond, DAG))
Cond = NewCond;
}
}
#if 0
// FIXME: LowerXALUO doesn't handle these!!
else if (Cond.getOpcode() == X86ISD::ADD ||
Cond.getOpcode() == X86ISD::SUB ||
Cond.getOpcode() == X86ISD::SMUL ||
Cond.getOpcode() == X86ISD::UMUL)
Cond = LowerXALUO(Cond, DAG);
#endif
// Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
isOneConstant(Cond.getOperand(1)))
Cond = Cond.getOperand(0);
// If the condition flag is set by an X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
if (CondOpcode == X86ISD::SETCC ||
CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
unsigned Opc = Cmp.getOpcode();
// FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
Cond = Cmp;
addTest = false;
} else {
switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
default: break;
case X86::COND_O:
case X86::COND_B:
// These can only come from an arithmetic instruction with overflow,
// e.g. SADDO, UADDO.
Cond = Cond.getOperand(1);
addTest = false;
break;
}
}
}
CondOpcode = Cond.getOpcode();
if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
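// Lower the overflow op to an X86 node that sets EFLAGS directly, then
// branch on the matching condition code (inverted when the == 0 form was
// matched above).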
SDValue Value;
X86::CondCode X86Cond;
std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
if (Inverted)
X86Cond = X86::GetOppositeBranchCondition(X86Cond);
CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
addTest = false;
} else {
unsigned CondOpc;
if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
SDValue Cmp = Cond.getOperand(0).getOperand(1);
if (CondOpc == ISD::OR) {
// Also, recognize the pattern generated by an FCMP_UNE. We can emit
// two branches instead of an explicit OR instruction with a
// separate test.
if (Cmp == Cond.getOperand(1).getOperand(1) &&
isX86LogicalCmp(Cmp)) {
CC = Cond.getOperand(0).getOperand(0);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = Cond.getOperand(1).getOperand(0);
Cond = Cmp;
addTest = false;
}
} else { // ISD::AND
// Also, recognize the pattern generated by an FCMP_OEQ. We can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
if (Cmp == Cond.getOperand(1).getOperand(1) &&
isX86LogicalCmp(Cmp) &&
Op.getNode()->hasOneUse()) {
X86::CondCode CCode0 =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode0 = X86::GetOppositeBranchCondition(CCode0);
CC = DAG.getTargetConstant(CCode0, dl, MVT::i8);
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_OEQ.
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain,
Dest, CC, Cmp);
X86::CondCode CCode1 =
(X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
CCode1 = X86::GetOppositeBranchCondition(CCode1);
CC = DAG.getTargetConstant(CCode1, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
}
} else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
// Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
// It should be transformed by the DAG combiner except when the condition
// is set by an arithmetic-with-overflow node.
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
Cond = Cond.getOperand(0).getOperand(1);
addTest = false;
} else if (Cond.getOpcode() == ISD::SETCC &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
// For FCMP_OEQ, we can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
if (Op.getNode()->hasOneUse()) {
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_OEQ.
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
} else if (Cond.getOpcode() == ISD::SETCC &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
// For FCMP_UNE, we can emit
// two branches instead of an explicit OR instruction with a
// separate test.
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
if (addTest) {
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
SDValue BTCC;
if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) {
CC = BTCC;
Cond = BT;
addTest = false;
}
}
}
if (addTest) {
X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cond);
}
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4K
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// the correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
SplitStack || EmitStackProbe;
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
MaybeAlign Alignment(Op.getConstantOperandVal(2));
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (!Lower) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Align StackAlign(TFI.getStackAlignment());
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
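// Round the new stack pointer down to the requested power-of-two boundary:
// for a 16-byte alignment, ANDing with ~15 clears the low four bits (the
// stack grows downward, so masking low bits always moves SP further down).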
if (Alignment && Alignment > StackAlign)
Result =
DAG.getNode(ISD::AND, dl, VT, Result,
DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
} else if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
// The 64-bit implementation of segmented stacks needs to clobber both r10
// and r11. This makes it impossible to use it along with nested parameters.
const Function &F = MF.getFunction();
for (const auto &A : F.args()) {
if (A.hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
}
}
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
Register Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
if (Alignment) {
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
Result = SP;
}
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
SDValue Ops[2] = {Result, Chain};
return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
if (!Subtarget.is64Bit() ||
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
// __va_list_tag:
// gp_offset (0 - 6 * 8)
// fp_offset (48 - 48 + 8 * 16)
// overflow_arg_area (point to parameters coming in memory).
// reg_save_area
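// For illustration, this corresponds roughly to the System V AMD64 layout:
//   struct __va_list_tag {
//     unsigned gp_offset;       // byte 0
//     unsigned fp_offset;       // byte 4
//     void *overflow_arg_area;  // byte 8
//     void *reg_save_area;      // byte 16
//   };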
SmallVector<SDValue, 8> MemOps;
SDValue FIN = Op.getOperand(1);
// Store gp_offset
SDValue Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV));
MemOps.push_back(Store);
// Store fp_offset
FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV, 4));
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
Store =
DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
MemOps.push_back(Store);
// Store ptr to reg_save_area.
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
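// Under LP64 the overflow pointer is 8 bytes, placing reg_save_area at byte
// 16; under the x32 ILP32 ABI pointers are 4 bytes, placing it at byte 12.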
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
Store = DAG.getStore(
Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
MemOps.push_back(Store);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
assert(Op.getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
// The Win64 ABI uses char* instead of a structure.
return DAG.expandVAArg(Op.getNode());
SDValue Chain = Op.getOperand(0);
SDValue SrcPtr = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
unsigned Align = Op.getConstantOperandVal(3);
SDLoc dl(Op);
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
uint8_t ArgMode;
// Decide which area this value should be read from.
// TODO: Implement the AMD64 ABI in its entirety. This simple
// selection mechanism works only for the basic types.
if (ArgVT == MVT::f80) {
llvm_unreachable("va_arg for f80 not yet implemented");
} else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
} else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
} else {
llvm_unreachable("Unhandled argument type in LowerVAARG");
}
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!Subtarget.useSoftFloat() &&
!(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
Subtarget.hasSSE1());
}
// Insert a VAARG_64 node into the DAG.
// VAARG_64 returns two values: the variable argument address and the chain.
SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
DAG.getConstant(ArgMode, dl, MVT::i8),
DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(
X86ISD::VAARG_64, dl,
VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
/*Align=*/0,
MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
// where a va_list is still an i8*.
assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
if (Subtarget.isCallingConvWin64(
DAG.getMachineFunction().getFunction().getCallingConv()))
// Probably a Win64 va_copy.
return DAG.expandVACopy(Op.getNode());
SDValue Chain = Op.getOperand(0);
SDValue DstPtr = Op.getOperand(1);
SDValue SrcPtr = Op.getOperand(2);
const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
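// Copy the full 24-byte va_list in one go: two i32 offsets plus two 8-byte
// pointers (4 + 4 + 8 + 8 = 24), with 8-byte alignment.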
return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
false, false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
switch (Opc) {
case ISD::SHL:
case X86ISD::VSHL:
case X86ISD::VSHLI:
return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
case ISD::SRL:
case X86ISD::VSRL:
case X86ISD::VSRLI:
return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
case ISD::SRA:
case X86ISD::VSRA:
case X86ISD::VSRAI:
return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
}
llvm_unreachable("Unknown target vector shift node");
}
/// Handle vector element shifts where the shift amount is a constant.
/// Takes immediate version of shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, uint64_t ShiftAmt,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
// Bitcast the source vector to the output type; this is mainly necessary for
// vXi8/vXi64 shifts.
if (VT != SrcOp.getSimpleValueType())
SrcOp = DAG.getBitcast(VT, SrcOp);
// Fold this packed shift into its first operand if ShiftAmt is 0.
if (ShiftAmt == 0)
return SrcOp;
// Check for ShiftAmt >= element width
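// Logical shifts by >= the element width produce zero, but an arithmetic
// right shift saturates at width - 1, leaving a splat of the sign bit.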
if (ShiftAmt >= ElementType.getSizeInBits()) {
if (Opc == X86ISD::VSRAI)
ShiftAmt = ElementType.getSizeInBits() - 1;
else
return DAG.getConstant(0, dl, VT);
}
assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
&& "Unknown target vector shift-by-constant node");
// Fold this packed vector shift into a build vector if SrcOp is a
// vector of Constants or UNDEFs.
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
switch (Opc) {
default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
// Must produce 0s in the correct bits.
Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRLI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
// Must produce 0s in the correct bits.
Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRAI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
// All shifted in bits must be the same so use 0.
Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
}
break;
}
return DAG.getBuildVector(VT, dl, Elts);
}
return DAG.getNode(Opc, dl, VT, SrcOp,
DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
}
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT SVT = ShAmt.getSimpleValueType();
assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
// Catch shift-by-constant.
if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
CShAmt->getZExtValue(), DAG);
// Change opcode to non-immediate version.
Opc = getTargetVShiftUniformOpcode(Opc, true);
// Need to build a vector containing the shift amount.
// SSE/AVX packed shifts only use the lower 64 bits of the shift count.
// +====================+============+=======================================+
// | ShAmt is           | HasSSE4.1? | Construct ShAmt vector as             |
// +====================+============+=======================================+
// | i64                | Yes, No    | Use ShAmt as lowest elt               |
// | i32                | Yes        | zero-extend in-reg                    |
// | (i32 zext(i16/i8)) | Yes        | zero-extend in-reg                    |
// | (i32 zext(i16/i8)) | No         | byte-shift-in-reg                     |
// | i16/i32            | No         | v4i32 build_vector(ShAmt, 0, ud, ud)  |
// +====================+============+=======================================+
if (SVT == MVT::i64)
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
(ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
ShAmt = ShAmt.getOperand(0);
MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
if (Subtarget.hasSSE41())
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
else {
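// Without SSE4.1, emulate the zero-extend with byte shifts: shifting the
// vector left by (16 - elt size) bytes and then logically right by the
// same amount leaves the low element in place with zeros above it.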
SDValue ByteShift = DAG.getTargetConstant(
(128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ByteShift);
ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ByteShift);
}
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
} else {
SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
DAG.getUNDEF(SVT)};
ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
}
// The return type has to be a 128-bit type with the same element
// type as the input type.
MVT EltVT = VT.getVectorElementType();
MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
ShAmt = DAG.getBitcast(ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
/// Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
if (isAllOnesConstant(Mask))
return DAG.getConstant(1, dl, MaskVT);
if (X86::isZeroNode(Mask))
return DAG.getConstant(0, dl, MaskVT);
assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
// In 32-bit mode a bitcast of i64 is illegal; split it into two i32 halves.
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
DAG.getConstant(0, dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
DAG.getConstant(1, dl, MVT::i32));
Lo = DAG.getBitcast(MVT::v32i1, Lo);
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
} else {
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
// When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getBitcast(BitcastVT, Mask),
DAG.getIntPtrConstant(0, dl));
}
}
/// Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
unsigned OpcodeSelect = ISD::VSELECT;
SDLoc dl(Op);
if (isAllOnesConstant(Mask))
return Op;
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
/// Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is that
/// the former uses "X86select" instead of "vselect". We just can't create the
/// "vselect" node for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
if (MaskConst->getZExtValue() & 0x1)
return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
DAG.getBitcast(MVT::v8i1, Mask),
DAG.getIntPtrConstant(0, dl));
if (Op.getOpcode() == X86ISD::FSETCCM ||
Op.getOpcode() == X86ISD::FSETCCM_SAE ||
Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
if (!Fn->hasPersonalityFn())
report_fatal_error(
"querying registration node size for function without personality");
// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
// WinEHStatePass for the full struct definition.
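// That is, 6 * 4 = 24 bytes for SEH and 4 * 4 = 16 bytes for C++ EH.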
switch (classifyEHPersonality(Fn->getPersonalityFn())) {
case EHPersonality::MSVC_X86SEH: return 24;
case EHPersonality::MSVC_CXX: return 16;
default: break;
}
report_fatal_error(
"can only recover FP for 32-bit MSVC EH personality functions");
}
/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
/// RegNodeBase = EntryEBP - RegNodeSize
/// ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
SDValue EntryEBP) {
MachineFunction &MF = DAG.getMachineFunction();
SDLoc dl;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
// It's possible that the parent function no longer has a personality function
// if the exceptional code was optimized away, in which case we just return
// the incoming EBP.
if (!Fn->hasPersonalityFn())
return EntryEBP;
// Get an MCSymbol that will ultimately resolve to the frame offset of the EH
// registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
// Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
// prologue to RBP in the parent function.
const X86Subtarget &Subtarget =
static_cast<const X86Subtarget &>(DAG.getSubtarget());
if (Subtarget.is64Bit())
return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
int RegNodeSize = getSEHRegistrationNodeSize(Fn);
// RegNodeBase = EntryEBP - RegNodeSize
// ParentFP = RegNodeBase - ParentFrameOffset
SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
DAG.getConstant(RegNodeSize, dl, PtrVT));
return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
return false;
};
auto isRoundModeSAE = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
unsigned RC = C->getZExtValue();
if (RC & X86::STATIC_ROUNDING::NO_EXC) {
// Clear the NO_EXC bit and check remaining bits.
RC ^= X86::STATIC_ROUNDING::NO_EXC;
// As a convenience we allow either no other bits set or an explicit
// current-direction value.
return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
}
}
return false;
};
auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
RC = C->getZExtValue();
if (RC & X86::STATIC_ROUNDING::NO_EXC) {
// Clear the NO_EXC bit and check remaining bits.
RC ^= X86::STATIC_ROUNDING::NO_EXC;
return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
RC == X86::STATIC_ROUNDING::TO_POS_INF ||
RC == X86::STATIC_ROUNDING::TO_ZERO;
}
}
return false;
};
SDLoc dl(Op);
unsigned IntNo = Op.getConstantOperandVal(0);
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP: {
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(2);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Op.getOperand(1),
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1));
}
case INTR_TYPE_1OP_SAE: {
SDValue Sae = Op.getOperand(2);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
}
case INTR_TYPE_2OP: {
SDValue Src2 = Op.getOperand(2);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(3);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Op.getOperand(1), Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Src2);
}
case INTR_TYPE_2OP_SAE: {
SDValue Sae = Op.getOperand(3);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
}
case INTR_TYPE_3OP:
case INTR_TYPE_3OP_IMM8: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Src1, Src2, Src3,
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
{Src1, Src2, Src3});
}
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// We add the rounding mode to the node when
// - an RC opcode is specified and
// - RC is not "current direction".
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return getVectorMaskingNode(
DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
Mask, PassThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return getVectorMaskingNode(
DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
Subtarget, DAG);
}
case INTR_TYPE_1OP_MASK_SAE: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Rnd = Op.getOperand(4);
unsigned Opc;
if (isRoundModeCurDirection(Rnd))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Rnd))
Opc = IntrData->Opc1;
else
return SDValue();
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
// There are 2 kinds of intrinsics in this group:
// (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
// (2) With rounding mode and sae - 7 operands.
bool HasRounding = IntrWithRoundingModeOpcode != 0;
if (Op.getNumOperands() == (5U + HasRounding)) {
if (HasRounding) {
SDValue Rnd = Op.getOperand(5);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return getScalarMaskingNode(
DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32)),
Mask, passThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
Src2),
Mask, passThru, Subtarget, DAG);
}
assert(Op.getNumOperands() == (6U + HasRounding) &&
"Unexpected intrinsic form");
SDValue RoundingMode = Op.getOperand(5);
unsigned Opc = IntrData->Opc0;
if (HasRounding) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrWithRoundingModeOpcode;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
Src2, RoundingMode),
Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_RND: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Rnd = Op.getOperand(5);
SDValue NewOp;
unsigned RC = 0;
if (isRoundModeCurDirection(Rnd))
NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
else if (isRoundModeSAEToX(Rnd, RC))
NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
else
return SDValue();
return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Sae = Op.getOperand(5);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue NewOp;
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
else if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
if (!NewOp)
NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Sae = Op.getOperand(6);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case BLENDV: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
Src3 = DAG.getBitcast(MaskVT, Src3);
// Reverse the operands to match VSELECT order.
return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
}
case VPERM_2OP : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
// Swap Src1 and Src2 in the node creation
return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
}
case IFMA_OP:
// NOTE: We need to swizzle the operands to pass the multiply operands
// first.
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getConstant(0, dl, MVT::v8i1),
FPclassMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(MVT::i8, Ins);
}
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
SDValue CC = Op.getOperand(3);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(4);
if (isRoundModeSAE(Sae))
return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC, Sae);
if (!isRoundModeCurDirection(Sae))
return SDValue();
}
// Default rounding mode.
return DAG.getNode(IntrData->Opc0, dl, MaskVT,
{Op.getOperand(1), Op.getOperand(2), CC});
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue CC = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Cmp;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
// Default rounding mode.
if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getConstant(0, dl, MVT::v8i1),
CmpMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(MVT::i8, Ins);
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: { // (ZF = 0 and PF = 0)
SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
break;
}
case ISD::SETNE: { // (ZF = 1 or PF = 1)
SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
break;
}
case ISD::SETGT: // (CF = 0 and ZF = 0)
SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
break;
case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
break;
}
case ISD::SETGE: // CF = 0
SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
break;
case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
break;
default:
llvm_unreachable("Unexpected illegal condition!");
}
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case COMI_RM: { // Comparison intrinsics with Sae
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
unsigned CondVal = Op.getConstantOperandVal(3);
SDValue Sae = Op.getOperand(4);
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
DAG.getTargetConstant(CondVal, dl, MVT::i8));
else if (isRoundModeSAE(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
else
return SDValue();
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getConstant(0, dl, MVT::v16i1),
FCmp, DAG.getIntPtrConstant(0, dl));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
DAG.getBitcast(MVT::i16, Ins));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), Subtarget,
DAG);
case COMPRESS_EXPAND_IN_REG: {
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
return Op.getOperand(1);
// Avoid false dependency.
if (PassThru.isUndef())
PassThru = DAG.getConstant(0, dl, VT);
return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
Mask);
}
case FIXUPIMM:
case FIXUPIMM_MASKZ: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Imm = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Passthru = (IntrData->Type == FIXUPIMM)
? Src1
: getZeroVector(VT, Subtarget, DAG, dl);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
}
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
auto Round = cast<ConstantSDNode>(Op.getOperand(2));
SDValue RoundingMode =
DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), RoundingMode);
}
case ROUNDS: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
auto Round = cast<ConstantSDNode>(Op.getOperand(3));
SDValue RoundingMode =
DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), RoundingMode);
}
case BEXTRI: {
assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode");
// The control is a TargetConstant, but we need to convert it to a
// ConstantSDNode.
uint64_t Imm = Op.getConstantOperandVal(2);
SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Control);
}
// ADC/ADCX/SBB
case ADX: {
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
SDValue Res;
// If the carry in is zero, then we should just use ADD/SUB instead of
// ADC/SBB.
if (isNullConstant(Op.getOperand(1))) {
Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
Op.getOperand(3));
} else {
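// Materialize CF from the incoming carry: adding 255 to a nonzero i8
// carry-in overflows 8 bits and sets CF to 1, while zero leaves CF clear;
// ADC/SBB then consume that flag.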
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
DAG.getConstant(-1, dl, MVT::i8));
Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
Op.getOperand(3), GenCF.getValue(1));
}
SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
SDValue Results[] = { SetCC, Res };
return DAG.getMergeValues(Results, dl);
}
case CVTPD2PS_MASK:
case CVTPD2DQ_MASK:
case CVTQQ2PS_MASK:
case TRUNCATE_TO_REG: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
if (isAllOnesConstant(Mask))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
{Src, PassThru, Mask});
}
case CVTPS2PH_MASK: {
SDValue Src = Op.getOperand(1);
SDValue Rnd = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
if (isAllOnesConstant(Mask))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
PassThru, Mask);
}
case CVTNEPS2BF16_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
if (ISD::isBuildVectorAllOnes(Mask.getNode()))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
// Break false dependency.
if (PassThru.isUndef())
PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
Mask);
}
default:
break;
}
}
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
// ptest and testp intrinsics. The intrinsics these come from are designed to
// return an integer value, not just an instruction, so lower it to the ptest
// or testp pattern and a setcc for the result.
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
case Intrinsic::x86_avx512_ktestc_q:
case Intrinsic::x86_avx512_ktestz_b:
case Intrinsic::x86_avx512_ktestz_w:
case Intrinsic::x86_avx512_ktestz_d:
case Intrinsic::x86_avx512_ktestz_q:
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestz_256:
case Intrinsic::x86_avx_ptestc_256:
case Intrinsic::x86_avx_ptestnzc_256:
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
case Intrinsic::x86_avx_vtestc_pd_256:
case Intrinsic::x86_avx_vtestnzc_pd_256: {
unsigned TestOpc = X86ISD::PTEST;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
case Intrinsic::x86_avx512_ktestc_q:
// CF = 1
TestOpc = X86ISD::KTEST;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_avx512_ktestz_b:
case Intrinsic::x86_avx512_ktestz_w:
case Intrinsic::x86_avx512_ktestz_d:
case Intrinsic::x86_avx512_ktestz_q:
TestOpc = X86ISD::KTEST;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_avx_ptestz_256:
// ZF = 1
X86CC = X86::COND_E;
break;
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestc_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_avx_ptestc_256:
// CF = 1
X86CC = X86::COND_B;
break;
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestnzc_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestnzc_256:
// ZF and CF = 0
X86CC = X86::COND_A;
break;
}
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
case Intrinsic::x86_sse42_pcmpestric128:
case Intrinsic::x86_sse42_pcmpistrio128:
case Intrinsic::x86_sse42_pcmpestrio128:
case Intrinsic::x86_sse42_pcmpistris128:
case Intrinsic::x86_sse42_pcmpestris128:
case Intrinsic::x86_sse42_pcmpistriz128:
case Intrinsic::x86_sse42_pcmpestriz128: {
unsigned Opcode;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpestria128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpistric128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpestric128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpistrio128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpestrio128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpistris128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpestris128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpistriz128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_sse42_pcmpestriz128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_E;
break;
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_sse42_pcmpistri128:
case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
Opcode = X86ISD::PCMPISTR;
else
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
case Intrinsic::x86_sse42_pcmpistrm128:
case Intrinsic::x86_sse42_pcmpestrm128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
Opcode = X86ISD::PCMPISTR;
else
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
}
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
auto &Context = MF.getMMI().getContext();
MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
Twine(MF.getFunctionNumber()));
return DAG.getNode(getGlobalWrapperKind(), dl, VT,
DAG.getMCSymbol(S, PtrVT));
}
case Intrinsic::x86_seh_lsda: {
// Compute the symbol for the LSDA. We know it'll get emitted later.
MachineFunction &MF = DAG.getMachineFunction();
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
SDValue Result = DAG.getMCSymbol(LSDASym, VT);
return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
}
case Intrinsic::eh_recoverfp: {
SDValue FnOp = Op.getOperand(1);
SDValue IncomingFPOp = Op.getOperand(2);
GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
if (!Fn)
report_fatal_error(
"llvm.eh.recoverfp must take a function as the first argument");
return recoverFramePointer(DAG, Fn, IncomingFPOp);
}
case Intrinsic::localaddress: {
// Returns one of the stack, base, or frame pointer registers, depending on
// which is used to reference local variables.
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
else { // Handles the SP or FP case.
bool CantUseFP = RegInfo->needsStackRealignment(MF);
if (CantUseFP)
Reg = RegInfo->getPtrSizedStackRegister(MF);
else
Reg = RegInfo->getPtrSizedFrameRegister(MF);
}
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
case Intrinsic::x86_avx512_vp2intersect_q_512:
case Intrinsic::x86_avx512_vp2intersect_q_256:
case Intrinsic::x86_avx512_vp2intersect_q_128:
case Intrinsic::x86_avx512_vp2intersect_d_512:
case Intrinsic::x86_avx512_vp2intersect_d_256:
case Intrinsic::x86_avx512_vp2intersect_d_128: {
MVT MaskVT = Op.getSimpleValueType();
SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
SDLoc DL(Op);
SDValue Operation =
DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
Op->getOperand(1), Op->getOperand(2));
SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
MaskVT, Operation);
SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
MaskVT, Operation);
return DAG.getMergeValues({Result0, Result1}, DL);
}
case Intrinsic::x86_mmx_pslli_w:
case Intrinsic::x86_mmx_pslli_d:
case Intrinsic::x86_mmx_pslli_q:
case Intrinsic::x86_mmx_psrli_w:
case Intrinsic::x86_mmx_psrli_d:
case Intrinsic::x86_mmx_psrli_q:
case Intrinsic::x86_mmx_psrai_w:
case Intrinsic::x86_mmx_psrai_d: {
SDLoc DL(Op);
SDValue ShAmt = Op.getOperand(2);
// If the argument is a constant, convert it to a target constant.
if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
// Clamp out-of-bounds shift amounts since they will otherwise be masked
// to 8 bits, which may make them no longer out of bounds.
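// (For example, a shift amount of 256 would be truncated to 0 by the
// 8-bit mask; clamping it to 255 keeps it out of bounds for every
// supported element width.)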
unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
Op.getOperand(0), Op.getOperand(1),
DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
}
unsigned NewIntrinsic;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_mmx_pslli_w:
NewIntrinsic = Intrinsic::x86_mmx_psll_w;
break;
case Intrinsic::x86_mmx_pslli_d:
NewIntrinsic = Intrinsic::x86_mmx_psll_d;
break;
case Intrinsic::x86_mmx_pslli_q:
NewIntrinsic = Intrinsic::x86_mmx_psll_q;
break;
case Intrinsic::x86_mmx_psrli_w:
NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
break;
case Intrinsic::x86_mmx_psrli_d:
NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
break;
case Intrinsic::x86_mmx_psrli_q:
NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
break;
case Intrinsic::x86_mmx_psrai_w:
NewIntrinsic = Intrinsic::x86_mmx_psra_w;
break;
case Intrinsic::x86_mmx_psrai_d:
NewIntrinsic = Intrinsic::x86_mmx_psra_d;
break;
}
// The vector shift intrinsics with scalars use 32-bit shift amounts but
// the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
// MMX register.
ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
DAG.getConstant(NewIntrinsic, DL, MVT::i32),
Op.getOperand(1), ShAmt);
}
}
}
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
VT.getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
// We support two versions of the gather intrinsics. One with a scalar mask
// and one with a vXi1 mask. Convert the scalar to vXi1 if necessary.
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
Src.getSimpleValueType().getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
// We support two versions of the scatter intrinsics. One with a scalar mask
// and one with a vXi1 mask. Convert the scalar to vXi1 if necessary.
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return Res.getValue(1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Mask, SDValue Base, SDValue Index,
SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
}
/// Handles the lowering of builtin intrinsics with a chain that return their
/// value into registers EDX:EAX.
/// If operand SrcReg is a valid register identifier, then operand 2 of N is
/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
/// TargetOpcode.
/// Returns a Glue value which can be used to add an extra copy-from-reg if
/// the expanded intrinsic implicitly defines extra registers (i.e. not just
/// EDX:EAX).
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
unsigned TargetOpcode,
unsigned SrcReg,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
SDValue Chain = N->getOperand(0);
SDValue Glue;
if (SrcReg) {
assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
Glue = Chain.getValue(1);
}
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue N1Ops[] = {Chain, Glue};
SDNode *N1 = DAG.getMachineNode(
TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
Chain = SDValue(N1, 0);
// The expanded instruction returns its result in registers EDX:EAX.
SDValue LO, HI;
if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
} else {
LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
LO.getValue(2));
}
Chain = HI.getValue(1);
Glue = HI.getValue(2);
if (Subtarget.is64Bit()) {
// Merge the two 32-bit values into a 64-bit one.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
return Glue;
}
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
SDValue Ops[] = { LO, HI };
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
Results.push_back(Pair);
Results.push_back(Chain);
return Glue;
}
/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
/* NoRegister */0, Subtarget,
Results);
if (Opcode != X86::RDTSCP)
return;
SDValue Chain = Results[1];
// Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
// the ECX register. Add 'ecx' explicitly to the chain.
SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
Results[1] = ecx;
Results.push_back(ecx.getValue(1));
}
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<SDValue, 3> Results;
SDLoc DL(Op);
getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, DL);
}
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue RegNode = Op.getOperand(2);
WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
report_fatal_error("EH registrations only live in functions using WinEH");
// Cast the operand to an alloca, and remember the frame index.
auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
if (!FINode)
report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
}
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue EHGuard = Op.getOperand(2);
WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
report_fatal_error("EHGuard only live in functions using WinEH");
// Cast the operand to an alloca, and remember the frame index.
auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
if (!FINode)
report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
EHInfo->EHGuardFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
}
/// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
SDValue Ops[] = { Chain, Val, Ptr, Undef };
return SignedSat ?
DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = { Chain, Val, Ptr, Mask };
return SignedSat ?
DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = Op.getConstantOperandVal(1);
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
switch (IntNo) {
case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
case llvm::Intrinsic::x86_rdpkru: {
SDLoc dl(Op);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
// Create a RDPKRU node and pass 0 to the ECX parameter.
return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
DAG.getConstant(0, dl, MVT::i32));
}
case llvm::Intrinsic::x86_wrpkru: {
SDLoc dl(Op);
// Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
// to the EDX and ECX parameters.
return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
Op.getOperand(0), Op.getOperand(2),
DAG.getConstant(0, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
}
case llvm::Intrinsic::x86_flags_read_u32:
case llvm::Intrinsic::x86_flags_read_u64:
case llvm::Intrinsic::x86_flags_write_u32:
case llvm::Intrinsic::x86_flags_write_u64: {
// We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
// during FinalizeISel in EmitInstrWithCustomInserter.
return Op;
}
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
case Intrinsic::x86_umwait:
case Intrinsic::x86_tpause: {
SDLoc dl(Op);
SDValue Chain = Op->getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic");
case Intrinsic::x86_umwait:
Opcode = X86ISD::UMWAIT;
break;
case Intrinsic::x86_tpause:
Opcode = X86ISD::TPAUSE;
break;
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
Opcode = X86ISD::LWPINS;
break;
}
SDValue Operation =
DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
case Intrinsic::x86_enqcmd:
case Intrinsic::x86_enqcmds: {
SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic!");
case Intrinsic::x86_enqcmd:
Opcode = X86ISD::ENQCMD;
break;
case Intrinsic::x86_enqcmds:
Opcode = X86ISD::ENQCMDS;
break;
}
SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
Op.getOperand(3));
SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
}
return SDValue();
}
SDLoc dl(Op);
switch(IntrData->Type) {
default: llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, cast to i32.
SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
DAG.getConstant(1, dl, Op->getValueType(1)),
DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
SDValue(Result.getNode(), 1)};
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
case GATHER_AVX2: {
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case GATHER: {
// gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {
// scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case PREFETCH: {
const APInt &HintVal = Op.getConstantOperandAPInt(6);
assert((HintVal == 2 || HintVal == 3) &&
"Wrong prefetch hint in intrinsic: should be 2 or 3");
unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
Subtarget);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
SmallVector<SDValue, 2> Results;
getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
case RDPMC:
// GetExtended Control Register.
case XGETBV: {
SmallVector<SDValue, 2> Results;
// RDPMC uses ECX to select the index of the performance counter to read.
// XGETBV uses ECX to select the index of the XCR register to return.
// The result is stored into registers EDX:EAX.
expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
Ret, SDValue(InTrans.getNode(), 1));
}
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
SDValue Mask = Op.getOperand(4);
SDValue DataToTruncate = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
EVT MemVT = MemIntr->getMemoryVT();
uint16_t TruncationOp = IntrData->Opc0;
switch (TruncationOp) {
case X86ISD::VTRUNC: {
if (isAllOnesConstant(Mask)) // return just a truncate store
return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDValue Offset = DAG.getUNDEF(VMask.getValueType());
return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
true /* truncating */);
}
case X86ISD::VTRUNCUS:
case X86ISD::VTRUNCS: {
bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
if (isAllOnesConstant(Mask))
return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand(), DAG);
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
VMask, MemVT, MemIntr->getMemOperand(), DAG);
}
default:
llvm_unreachable("Unsupported truncstore intrinsic");
}
}
}
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setReturnAddressIsTaken(true);
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
unsigned Depth = Op.getConstantOperandVal(0);
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
MachinePointerInfo());
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
MachinePointerInfo());
}
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
return getReturnAddressFrameIndex(DAG);
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
EVT VT = Op.getValueType();
MFI.setFrameAddressIsTaken(true);
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
// Depth > 0 makes no sense on targets which use Windows unwind codes. It
// is not possible to crawl up the stack without looking at the unwind codes
// simultaneously.
int FrameAddrIndex = FuncInfo->getFAIndex();
if (!FrameAddrIndex) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
FuncInfo->setFAIndex(FrameAddrIndex);
}
return DAG.getFrameIndex(FrameAddrIndex, VT);
}
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = Op.getConstantOperandVal(0);
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
Register Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
.Case("ebp", X86::EBP)
.Case("rbp", X86::RBP)
.Default(0);
if (Reg == X86::EBP || Reg == X86::RBP) {
if (!TFI.hasFP(MF))
report_fatal_error("register " + StringRef(RegName) +
" is allocatable: function has no frame pointer");
#ifndef NDEBUG
else {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
"Invalid Frame Register!");
}
#endif
}
if (Reg)
return Reg;
report_fatal_error("Invalid register name global variable");
}
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
unsigned X86TargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
unsigned X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}
bool X86TargetLowering::needsFixedCatchObjects() const {
return Subtarget.isTargetWin64();
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
SDLoc dl (Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
DAG.getIntPtrConstant(RegInfo->getSlotSize(),
dl));
StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
DAG.getRegister(StoreAddrReg, PtrVT));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
// If the subtarget is not 64-bit, we may need the global base reg
// after the pseudo-instruction expansion in isel, i.e., after the CGBR
// pass has run. Therefore, ask for the GlobalBaseReg now, so that the
// pass inserts the code for us in case we need it.
// Otherwise, we would end up referencing a virtual register that is
// never defined!
if (!Subtarget.is64Bit()) {
const X86InstrInfo *TII = Subtarget.getInstrInfo();
(void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
}
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
Op.getOperand(0));
}
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
return Op.getOperand(0);
}
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
SDValue Root = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (Subtarget.is64Bit()) {
SDValue OutChains[6];
// Large code-model.
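// The trampoline emitted below is, roughly:
//   movabsq $<nested function>, %r11
//   movabsq $<nest value>, %r10
//   jmpq *%r11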
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
// Load the pointer to the nested function into R11.
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
SDValue Addr = Trmp;
OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(2, dl, MVT::i64));
OutChains[1] =
DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
/* Alignment = */ 2);
// Load the 'nest' parameter value into R10.
// R10 is specified in X86CallingConv.td
OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(10, dl, MVT::i64));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 10));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(12, dl, MVT::i64));
OutChains[3] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
/* Alignment = */ 2);
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(20, dl, MVT::i64));
OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 20));
unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(22, dl, MVT::i64));
OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 22));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
CallingConv::ID CC = Func->getCallingConv();
unsigned NestReg;
switch (CC) {
default:
llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::X86_StdCall: {
// Pass 'nest' parameter in ECX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::ECX;
// Check that ECX wasn't needed by an 'inreg' parameter.
FunctionType *FTy = Func->getFunctionType();
const AttributeList &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
unsigned InRegCount = 0;
unsigned Idx = 1;
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
auto &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
}
if (InRegCount > 2) {
report_fatal_error("Nest register in use - reduce number of inreg"
" parameters!");
}
}
break;
}
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
case CallingConv::Tail:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
break;
}
SDValue OutChains[4];
SDValue Addr, Disp;
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(10, dl, MVT::i32));
Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] =
DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(1, dl, MVT::i32));
OutChains[1] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
/* Alignment = */ 1);
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(5, dl, MVT::i32));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 5),
/* Alignment = */ 1);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(6, dl, MVT::i32));
OutChains[3] =
DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
/* Alignment = */ 1);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
}
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
/*
The rounding mode is in bits 11:10 of the x87 FP control word (read below
with FNSTCW), and has the following settings:
00 Round to nearest
01 Round to -inf
10 Round to +inf
11 Round to 0
FLT_ROUNDS, on the other hand, expects the following:
-1 Undefined
0 Round to 0
1 Round to nearest
2 Round to +inf
3 Round to -inf
To perform the conversion, we do:
(((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
*/
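// For example, FPCW bits 11:10 == 01 (round to -inf) yield
// ((0 >> 11) | (0x400 >> 9)) + 1 = 3, i.e. FLT_ROUNDS "round to -inf";
// bits 11:10 == 11 (round to 0) yield ((1 | 2) + 1) & 3 = 0, as required.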
MachineFunction &MF = DAG.getMachineFunction();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Align StackAlignment(TFI.getStackAlignment());
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
int SSFI =
MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
MachineMemOperand::MOStore, 2, 2);
SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
DAG.getVTList(MVT::Other),
Ops, MVT::i16, MMO);
// Load FP Control Word from stack slot
SDValue CWD =
DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
// Transform as necessary
SDValue CWD1 =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0x800, DL, MVT::i16)),
DAG.getConstant(11, DL, MVT::i8));
SDValue CWD2 =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0x400, DL, MVT::i16)),
DAG.getConstant(9, DL, MVT::i8));
SDValue RetVal =
DAG.getNode(ISD::AND, DL, MVT::i16,
DAG.getNode(ISD::ADD, DL, MVT::i16,
DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
DAG.getConstant(1, DL, MVT::i16)),
DAG.getConstant(3, DL, MVT::i16));
return DAG.getNode((VT.getSizeInBits() < 16 ?
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
// Split a unary integer op into 2 half-sized ops.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumElems = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
MVT EltVT = VT.getVectorElementType();
SDValue Src = Op.getOperand(0);
assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
"Src and Op should have the same element type!");
// Extract the Lo/Hi vectors
SDLoc dl(Op);
SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}
// Decompose 256-bit ops into smaller 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return LowerVectorIntUnary(Op, DAG);
}
// Decompose 512-bit ops into smaller 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is512BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 512-bit vector integer operation");
return LowerVectorIntUnary(Op, DAG);
}
/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
"Unsupported element type");
// Split the vector; its Lo and Hi parts will be handled in the next iteration.
if (NumElems > 16 ||
(NumElems == 16 && !Subtarget.canExtendTo512DQ()))
return LowerVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation");
// Use the natively supported vector instruction vplzcntd.
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
int NumElts = VT.getVectorNumElements();
int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
// Per-nibble leading zero PSHUFB lookup table.
const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
/* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
/* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumBytes; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
// Begin by bitcasting the input to a byte vector, then split those bytes
// into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
// If the hi input nibble is zero then we add both results together, otherwise
// we just take the hi result (by masking the lo result to zero before the
// add).
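// For example, for the byte 0x0A the hi nibble is 0 (LUT value 4) and the
// lo nibble is 0xA (LUT value 0), so the two LUT results add to 4, which
// is ctlz(0x0A) as an 8-bit value. For 0x1A the hi nibble is nonzero, so
// only its LUT value is kept, giving ctlz(0x1A) == 3.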
SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
SDValue Zero = DAG.getConstant(0, DL, CurrVT);
SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
SDValue Lo = Op0;
SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
SDValue HiZ;
if (CurrVT.is512BitVector()) {
MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
} else {
HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
}
Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
// Merge the result from vXi8 back to VT, working on the lo/hi halves
// of the current vector width in the same way we did for the nibbles.
// If the upper half of the input element is zero then add the halves'
// leading zero counts together, otherwise just use the upper half's.
// Double the width of the result until we are at target width.
while (CurrVT != VT) {
int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
int CurrNumElts = CurrVT.getVectorNumElements();
MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
// Check if the upper half of the input element is zero.
if (CurrVT.is512BitVector()) {
MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
} else {
HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
}
HiZ = DAG.getBitcast(NextVT, HiZ);
// Move the upper/lower halves to the lower bits as we'll be extending to
// NextVT. Mask the lower result to zero if HiZ is true and add the results
// together.
SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
CurrVT = NextVT;
}
return Res;
}
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (Subtarget.hasCDI() &&
// vXi8 vectors need to be promoted to vXi32, which requires 512-bit vectors.
(Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
return Lower512IntUnary(Op, DAG);
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
if (VT.isVector())
return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
// Zero extend to i32 since there is no i8 bsr.
OpVT = MVT::i32;
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
// Issue a bsr (scan bits in reverse) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
if (Opc == ISD::CTLZ) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)};
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
}
// Finally xor with NumBits-1.
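// For a nonzero input, CTLZ == (NumBits - 1) - BSR. Since NumBits is a
// power of two, NumBits - 1 is an all-ones mask, so the subtraction can
// be done with an XOR.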
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
DAG.getConstant(NumBits - 1, dl, OpVT));
if (VT == MVT::i8)
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
return Op;
}
static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumBits = VT.getScalarSizeInBits();
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");
// Issue a bsf (scan bits forward) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
// If src is zero (i.e. bsf sets ZF), returns NumBits.
SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)};
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is256BitVector() && VT.isInteger() &&
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
/// Break a 512-bit integer operation into two new 256-bit ones and then
/// concatenate the result back.
static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is512BitVector() && VT.isInteger() &&
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
if (VT == MVT::i16 || VT == MVT::i32)
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return split256IntArith(Op, DAG);
}
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
if (VT.getScalarType() == MVT::i1) {
SDLoc dl(Op);
switch (Opcode) {
default: llvm_unreachable("Expected saturated arithmetic opcode");
case ISD::UADDSAT:
case ISD::SADDSAT:
// *addsat i1 X, Y --> X | Y
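// (i1 addition saturates at 1, so the result is set iff either input is.)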
return DAG.getNode(ISD::OR, dl, VT, X, Y);
case ISD::USUBSAT:
case ISD::SSUBSAT:
// *subsat i1 X, Y --> X & ~Y
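// (i1 subtraction saturates at 0, so the result is 1 only when X is 1 and
// Y is 0.)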
return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
}
}
if (VT.is128BitVector()) {
// Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), VT);
SDLoc DL(Op);
if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
// uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
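// (For unsigned addition, overflow occurred iff the wrapped sum is
// smaller than either operand.)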
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
}
if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
// usubsat X, Y --> (X >u Y) ? X - Y : 0
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
}
// Use default expansion.
return SDValue();
}
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return split256IntArith(Op, DAG);
}
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
// Since X86 does not have CMOV for 8-bit integer, we don't convert
// 8-bit integer abs to NEG and CMOV.
SDLoc DL(Op);
SDValue N0 = Op.getOperand(0);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
DAG.getConstant(0, DL, VT), N0);
SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
SDValue(Neg.getNode(), 1)};
return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
}
// ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
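// (BLENDV selects each lane on the sign bit of the selector operand, so
// negative lanes of X take the negated value 0-X and the rest keep X.)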
if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
SDValue Sub =
DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
}
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
assert(VT.isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntUnary(Op, DAG);
}
// Default to expand.
return SDValue();
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
// For AVX1 cases, split to use legal ops (everything but v4i64).
if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
return split256IntArith(Op, DAG);
SDLoc DL(Op);
unsigned Opcode = Op.getOpcode();
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
// using the SMIN/SMAX instructions and flipping the signbit back.
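// (x ^ 0x8000 is an order-preserving map from unsigned to signed i16, so
// umin(x, y) == (smin(x ^ 0x8000, y ^ 0x8000)) ^ 0x8000, and likewise for
// umax.)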
if (VT == MVT::v8i16) {
assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
"Unexpected MIN/MAX opcode");
SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
}
// Else, expand to a compare/select.
ISD::CondCode CC;
switch (Opcode) {
case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
default: llvm_unreachable("Unknown MINMAX opcode");
}
SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
return DAG.getSelect(DL, VT, Cond, N0, N1);
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return split256IntArith(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Lower v16i8/v32i8/v64i8 mul as extension to v8i16/v16i16/v32i16
// vector pairs, multiply and truncate.
if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
return DAG.getNode(
ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::MUL, dl, ExVT,
DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
}
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Extract the lo/hi parts and any-extend them to i16.
// We're going to keep only the low byte of each result element of the
// pmullw (masking the rest off), so it doesn't matter what's in the high
// byte of each 16-bit element.
SDValue Undef = DAG.getUNDEF(VT);
SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If the RHS is a constant, manually unpackl/unpackh.
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
MVT::i16));
HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
MVT::i16));
}
}
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
}
// Multiply, mask the lower 8 bits of the lo/hi results and pack.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
if (VT == MVT::v4i32) {
assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmulld is available!");
// Extract the odd parts.
static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
// Multiply the even parts.
SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, A),
DAG.getBitcast(MVT::v2i64, B));
// Now multiply odd parts.
SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, Aodds),
DAG.getBitcast(MVT::v2i64, Bodds));
Evens = DAG.getBitcast(VT, Evens);
Odds = DAG.getBitcast(VT, Odds);
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
static const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
// AloBlo = pmuludq(a, b);
// AloBhi = pmuludq(a, Bhi);
// AhiBlo = pmuludq(Ahi, b);
//
// Hi = psllqi(AloBhi + AhiBlo, 32);
// return AloBlo + Hi;
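//
// This follows from writing each 64-bit operand in 32-bit halves:
// a * b = (aLo + 2^32*aHi) * (bLo + 2^32*bHi)
// = aLo*bLo + 2^32*(aLo*bHi + aHi*bLo) (mod 2^64);
// the aHi*bHi term is shifted entirely out of the low 64 bits.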
KnownBits AKnown = DAG.computeKnownBits(A);
KnownBits BKnown = DAG.computeKnownBits(B);
APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
SDValue Zero = DAG.getConstant(0, dl, VT);
// Only multiply lo/hi halves that aren't known to be zero.
SDValue AloBlo = Zero;
if (!ALoIsZero && !BLoIsZero)
AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
SDValue AloBhi = Zero;
if (!ALoIsZero && !BHiIsZero) {
SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
}
SDValue AhiBlo = Zero;
if (!AHiIsZero && !BLoIsZero) {
SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
}
SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
bool IsSigned = Op->getOpcode() == ISD::MULHS;
unsigned NumElts = VT.getVectorNumElements();
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return split256IntArith(Op, DAG);
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
(VT == MVT::v16i32 && Subtarget.hasAVX512()));
// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widened result.
// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
//
// In other words, to have all the results, we need to perform two PMULxD:
// 1. one with the even values.
// 2. one with the odd values.
// To achieve #2, we need to place the odd values at an even position.
//
// Place the odd value at an even position (basically, shift all values 1
// step to the left):
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
9, -1, 11, -1, 13, -1, 15, -1};
// <a|b|c|d> => <b|undef|d|undef>
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
makeArrayRef(&Mask[0], NumElts));
// <e|f|g|h> => <f|undef|h|undef>
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
makeArrayRef(&Mask[0], NumElts));
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
unsigned Opcode =
(IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, A),
DAG.getBitcast(MulVT, B)));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, Odd0),
DAG.getBitcast(MulVT, Odd1)));
// Shuffle it back into the right order.
SmallVector<int, 16> ShufMask(NumElts);
for (int i = 0; i != (int)NumElts; ++i)
ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
// If we have a signed multiply but no PMULDQ, fix up the result of an
// unsigned multiply.
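// Sketch of the fixup identity, per 32-bit lane:
// mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
// because sext64(x) == zext64(x) - ((x < 0 ? 1 : 0) << 32) and only the
// high half of the product is kept; the SETGT masks below select b and a
// for the two correction terms.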
if (IsSigned && !Subtarget.hasSSE41()) {
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
}
return Res;
}
// Only i8 vectors should need custom lowering after this.
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
"Unsupported vector type");
// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
// logical shift down the upper half and pack back to i8.
// With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
// and then ashr/lshr the upper bits down to the lower bits before multiply.
unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
}
// For signed 512-bit vectors, split into 256-bit vectors to allow the
// sign-extension to occur.
if (VT == MVT::v64i8 && IsSigned)
return split512IntArith(Op, DAG);
// Signed AVX2 implementation - extend xmm subvectors to ymm.
if (VT == MVT::v32i8 && IsSigned) {
MVT ExVT = MVT::v16i16;
SDValue ALo = extract128BitVector(A, 0, DAG, dl);
SDValue BLo = extract128BitVector(B, 0, DAG, dl);
SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
// Bitcast back to VT and then pack all the even elements from Lo and Hi.
// Shuffle lowering should turn this into PACKUS+PERMQ
Lo = DAG.getBitcast(VT, Lo);
Hi = DAG.getBitcast(VT, Hi);
return DAG.getVectorShuffle(VT, dl, Lo, Hi,
{ 0, 2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 26, 28, 30,
32, 34, 36, 38, 40, 42, 44, 46,
48, 50, 52, 54, 56, 58, 60, 62});
}
// For signed v16i8 and all unsigned vXi8 we will unpack the low and high
// half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
// shift the results and pack the half lane results back together.
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1};
// Extract the lo parts and zero/sign extend to i16.
// Only use SSE4.1 instructions for signed v16i8 where using unpack requires
// shifts to sign extend. Using unpack for unsigned only requires an xor to
// create zeros and a copy due to tied register constraints pre-AVX. But using
// zero_extend_vector_inreg would require an additional pshufd for the high
// part.
SDValue ALo, AHi;
if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
} else if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
} else {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
DAG.getConstant(0, dl, VT)));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
DAG.getConstant(0, dl, VT)));
}
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If the RHS is a constant, manually unpackl/unpackh and extend.
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
SDValue LoOp = B.getOperand(i + j);
SDValue HiOp = B.getOperand(i + j + 8);
if (IsSigned) {
LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
} else {
LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
}
LoOps.push_back(LoOp);
HiOps.push_back(HiOp);
}
}
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
} else if (IsSigned) {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
} else {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
DAG.getConstant(0, dl, VT)));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
DAG.getConstant(0, dl, VT)));
}
// Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
// pack back to vXi8.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
// Bitcast back to VT and then pack all the even elements from Lo and Hi.
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.isTargetWin64() && "Unexpected target");
EVT VT = Op.getValueType();
assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
"Unexpected return type for lowering");
RTLIB::Libcall LC;
bool isSigned;
switch (Op->getOpcode()) {
default: llvm_unreachable("Unexpected request for libcall!");
case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
}
SDLoc dl(Op);
SDValue InChain = DAG.getEntryNode();
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
EVT ArgVT = Op->getOperand(i).getValueType();
assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
"Unexpected argument type for lowering");
SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
Entry.Node = StackPtr;
InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
MachinePointerInfo(), /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.IsSExt = false;
Entry.IsZExt = false;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(InChain)
.setLibCallee(
getLibcallCallingConv(LC),
static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
std::move(Args))
.setInRegister()
.setSExtResult(isSigned)
.setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
}
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (VT.getScalarSizeInBits() < 16)
return false;
if (VT.is512BitVector() && Subtarget.hasAVX512() &&
(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256());
bool AShift = LShift && (Subtarget.hasAVX512() ||
(VT != MVT::v2i64 && VT != MVT::v4i64));
return (Opcode == ISD::SRA) ? AShift : LShift;
}
// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
return false;
// vXi16 supported only on AVX-512 with BWI.
if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
if (Subtarget.hasAVX512())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
return (Opcode == ISD::SRA) ? AShift : LShift;
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
// ashr(R, 63) === cmp_slt(R, 0)
if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
"Unsupported PCMPGT op");
return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
}
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt - 32, DAG);
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{9, 1, 11, 3, 13, 5, 15, 7});
} else {
// SRA upper i32, SRL whole i64 and select lower i32.
SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt, DAG);
SDValue Lower =
getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
Lower = DAG.getBitcast(ExVT, Lower);
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{8, 1, 10, 3, 12, 5, 14, 7});
}
return DAG.getBitcast(VT, Ex);
};
// Optimize shl/srl/sra with constant shift amount.
APInt APIntShiftAmt;
if (!X86::isConstantSplat(Amt, APIntShiftAmt))
return SDValue();
// If the shift amount is out of range, return undef.
if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
return DAG.getUNDEF(VT);
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
(Subtarget.hasInt256() && VT == MVT::v4i64)) &&
Op.getOpcode() == ISD::SRA)
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Simple i8 add case
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
return DAG.getNode(ISD::ADD, dl, VT, R, R);
// ashr(R, 7) === cmp_slt(R, 0)
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
SDValue Zeros = DAG.getConstant(0, dl, VT);
if (VT.is512BitVector()) {
assert(VT == MVT::v64i8 && "Unexpected element type!");
SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
}
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
}
// XOP can shift v16i8 directly instead of as a v8i16 shift + mask.
if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
ShiftAmt, DAG);
SRL = DAG.getBitcast(VT, SRL);
// Zero out the leftmost bits.
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
}
if (Op.getOpcode() == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
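// E.g. with R = 0x90 (-112) and ShiftAmt = 4: lshr gives 0x09, Mask is
// 128 >> 4 == 0x08, xor gives 0x01 and 0x01 - 0x08 == 0xF9 (-7), which
// matches ashr(-112, 4).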
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
llvm_unreachable("Unknown shift opcode.");
}
return SDValue();
}
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
else if (EltVT.bitsLT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
}
// vXi8 shifts - shift as v8i16 + mask result.
if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
(VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
VT == MVT::v64i8) &&
!Subtarget.hasXOP()) {
unsigned NumElts = VT.getVectorNumElements();
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
// Create the mask using vXi16 shifts. For shift-rights we need to move
// the upper byte down before splatting the vXi8 mask.
SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
BaseShAmt, Subtarget, DAG);
if (Opcode != ISD::SHL)
BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
8, DAG);
BitMask = DAG.getBitcast(VT, BitMask);
BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
SmallVector<int, 64>(NumElts, 0));
SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
DAG.getBitcast(ExtVT, R), BaseShAmt,
Subtarget, DAG);
Res = DAG.getBitcast(VT, Res);
Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
if (Opcode == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
// SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
BaseShAmt, Subtarget, DAG);
SignMask = DAG.getBitcast(VT, SignMask);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
}
return Res;
}
}
}
// Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
std::vector<SDValue> Vals(Ratio);
for (unsigned i = 0; i != Ratio; ++i)
Vals[i] = Amt.getOperand(i);
for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
for (unsigned j = 0; j != Ratio; ++j)
if (Vals[j] != Amt.getOperand(i + j))
return SDValue();
}
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
}
return SDValue();
}
// Convert a shift/rotate left amount to a multiplication scale factor.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Amt.getSimpleValueType();
if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
(Subtarget.hasInt256() && VT == MVT::v16i16) ||
(!Subtarget.hasAVX512() && VT == MVT::v16i8)))
return SDValue();
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
SmallVector<SDValue, 8> Elts;
MVT SVT = VT.getVectorElementType();
unsigned SVTBits = SVT.getSizeInBits();
APInt One(SVTBits, 1);
unsigned NumElems = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = Amt->getOperand(i);
if (Op->isUndef()) {
Elts.push_back(Op);
continue;
}
ConstantSDNode *ND = cast<ConstantSDNode>(Op);
APInt C(SVTBits, ND->getZExtValue());
uint64_t ShAmt = C.getZExtValue();
if (ShAmt >= SVTBits) {
Elts.push_back(DAG.getUNDEF(SVT));
continue;
}
Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
}
return DAG.getBuildVector(VT, dl, Elts);
}
// If the target doesn't support variable shifts, use either FP conversion
// or integer multiplication to avoid shifting each element individually.
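// The v4i32 case below builds the float 2^Amt directly: shifting Amt into
// the exponent field (bit 23) and adding the bit pattern of 1.0f
// (0x3f800000) gives an IEEE-754 value with exponent 127 + Amt, so the
// FP_TO_SINT recovers 2^Amt exactly for 0 <= Amt <= 30. E.g. Amt = 5:
// (5 << 23) + 0x3f800000 == 0x42000000 == 32.0f -> 32.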
if (VT == MVT::v4i32) {
Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
DAG.getConstant(0x3f800000U, dl, VT));
Amt = DAG.getBitcast(MVT::v4f32, Amt);
return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
}
// AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
SDValue Z = DAG.getConstant(0, dl, VT);
SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
if (Subtarget.hasSSE41())
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
DAG.getBitcast(VT, Hi),
{0, 2, 4, 6, 8, 10, 12, 14});
}
return SDValue();
}
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
unsigned Opc = Op.getOpcode();
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
assert(VT.isVector() && "Custom lowering only for vector shifts!");
assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;
if (SupportedVectorVarShift(VT, Subtarget, Opc))
return Op;
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
VT == MVT::v8i16 || VT == MVT::v16i8)) {
if (Opc == ISD::SRL || Opc == ISD::SRA) {
SDValue Zero = DAG.getConstant(0, dl, VT);
Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
}
if (Opc == ISD::SHL || Opc == ISD::SRL)
return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
if (Opc == ISD::SRA)
return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
}
// v2i64 vector logical shifts can efficiently avoid scalarization - do the
// shifts per-lane and then shuffle the partial results back together.
if (VT == MVT::v2i64 && Opc != ISD::SRA) {
// Splat the shift amounts so the scalar shifts above will catch it.
SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
}
// i64 vector arithmetic shift can be emulated with the transform:
// M = lshr(SIGN_MASK, Amt)
// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
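// M is the sign bit moved to the post-shift position, so the xor/sub pair
// is the usual "(x ^ m) - m" trick that sign-extends lanes whose original
// top bit was set.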
if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
Opc == ISD::SRA) {
SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
R = DAG.getNode(ISD::XOR, dl, VT, R, M);
R = DAG.getNode(ISD::SUB, dl, VT, R, M);
return R;
}
// If possible, lower this shift as a sequence of two shifts by
// constant plus a BLENDing shuffle instead of scalarizing it.
// Example:
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
//
// Could be rewritten as:
// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
//
// The advantage is that the two shifts from the example would be
// lowered as X86ISD::VSRLI nodes in parallel before blending.
if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
(VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue Amt1, Amt2;
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 8> ShuffleMask;
for (unsigned i = 0; i != NumElts; ++i) {
SDValue A = Amt->getOperand(i);
if (A.isUndef()) {
ShuffleMask.push_back(SM_SentinelUndef);
continue;
}
if (!Amt1 || Amt1 == A) {
ShuffleMask.push_back(i);
Amt1 = A;
continue;
}
if (!Amt2 || Amt2 == A) {
ShuffleMask.push_back(i + NumElts);
Amt2 = A;
continue;
}
break;
}
// Only perform this blend if we can perform it without loading a mask.
if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
(VT != MVT::v16i16 ||
is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
(VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
canWidenShuffleElements(ShuffleMask))) {
auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
Cst2->getAPIntValue().ult(EltSizeInBits)) {
SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
Cst1->getZExtValue(), DAG);
SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
Cst2->getZExtValue(), DAG);
return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
}
}
}
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
if (Opc == ISD::SHL)
if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
// Constant ISD::SRL can be performed efficiently on vXi16 vectors, as we
// can replace it with ISD::MULHU, creating a scale factor from (NumEltBits - Amt).
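// Sketch: for a constant 0 < Amt < 16 per i16 lane,
// mulhu(R, 1 << (16 - Amt)) == (R * 2^(16 - Amt)) >> 16 == R >> Amt.
// Amt == 0 would need a scale of 2^16 which doesn't fit in i16, hence the
// SETEQ-with-zero select below that passes R through unchanged.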
if (Opc == ISD::SRL && ConstantAmt &&
(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
return DAG.getSelect(dl, VT, ZAmt, R, Res);
}
}
// Constant ISD::SRA can be performed efficiently on vXi16 vectors, as we
// can replace it with ISD::MULHS, creating a scale factor from (NumEltBits - Amt).
// TODO: Special case handling for shift by 0/1, really we can afford either
// of these cases in pre-SSE41/XOP/AVX512 but not both.
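// Sketch: mulhs(R, 1 << (16 - Amt)) == ashr(R, Amt) for 1 < Amt < 16, but
// Amt == 1 would need the scale 0x8000, which is negative as an i16, so
// that case is fixed up with an explicit VSRAI below, and Amt == 0 simply
// selects R.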
if (Opc == ISD::SRA && ConstantAmt &&
(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
!Subtarget.hasAVX512()) ||
DAG.isKnownNeverZero(Amt))) {
SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
SDValue Amt0 =
DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
SDValue Amt1 =
DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
SDValue Sra1 =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
Res = DAG.getSelect(dl, VT, Amt0, R, Res);
return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
}
}
// v4i32 non-uniform shifts.
// If the shift amount is constant we can shift each lane using the SSE2
// immediate shifts, else we need to zero-extend each lane to the lower i64
// and shift using the SSE2 variable shifts.
// The separate results can then be blended together.
if (VT == MVT::v4i32) {
SDValue Amt0, Amt1, Amt2, Amt3;
if (ConstantAmt) {
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
} else {
// The SSE2 shifts use the lower i64 as the same shift amount for
// all lanes and the upper i64 is ignored. On AVX we're better off
// just zero-extending, but for SSE just duplicating the top 16 bits is
// cheaper and has the same effect for out of range values.
if (Subtarget.hasAVX()) {
SDValue Z = DAG.getConstant(0, dl, VT);
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
} else {
SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{4, 5, 6, 7, -1, -1, -1, -1});
Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{0, 1, 1, 1, -1, -1, -1, -1});
Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{2, 3, 3, 3, -1, -1, -1, -1});
Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
{0, 1, 1, 1, -1, -1, -1, -1});
Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
{2, 3, 3, 3, -1, -1, -1, -1});
}
}
unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
// Merge the shifted lane results optimally with/without PBLENDW.
// TODO - ideally shuffle combining would handle this.
if (Subtarget.hasSSE41()) {
SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}
SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
}
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.
// NOTE: We honor the preferred vector width before promoting to 512 bits.
if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
(Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
(Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
(Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
"Unexpected vector type");
MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
R = DAG.getNode(ExtOpc, dl, ExtVT, R);
Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Opc, dl, ExtVT, R, Amt));
}
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
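// Sketch, per byte lane with a constant 0 <= Amt < 8:
// lshr(r, Amt) == (zext16(r) * (1 << (8 - Amt))) >> 8
// (and likewise for ashr with a sign extension), so after widening each
// byte to i16 one vXi16 MUL plus a shift-right-by-8 performs all the
// per-element shifts at once.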
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
(VT == MVT::v16i8 || VT == MVT::v64i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
// Extend constant shift amount to vXi16 (it doesn't matter if the type
// isn't legal).
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
"Constant build vector expected");
if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
: DAG.getZExtOrTrunc(R, dl, ExVT);
R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
return DAG.getZExtOrTrunc(R, dl, VT);
}
SmallVector<SDValue, 16> LoAmt, HiAmt;
for (int i = 0; i != NumElts; i += 16) {
for (int j = 0; j != 8; ++j) {
LoAmt.push_back(Amt.getOperand(i + j));
HiAmt.push_back(Amt.getOperand(i + j + 8));
}
}
MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
}
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (VT.is512BitVector()) {
// On AVX512BW targets we make use of the fact that VSELECT lowers
// to a masked blend which selects bytes based just on the sign bit
// extracted to a mask.
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
ISD::SETGT);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = DAG.getConstant(0, dl, SelVT);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
return DAG.getSelect(dl, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
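// E.g. a lane amount of 5 (0b101): after the << 5 its bit 2 sits in the
// sign bit, so the first select applies the shift-by-4; a += a then moves
// bit 1 (here 0) into the sign bit for the shift-by-2 step, and finally
// bit 0 selects the shift-by-1 - composing 4 + 1 == 5.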
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
Amt = DAG.getBitcast(VT, Amt);
if (Opc == ISD::SHL || Opc == ISD::SRL) {
// r = VSELECT(r, shift(r, 4), a);
SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
return R;
}
if (Opc == ISD::SRA) {
// For SRA we need to unpack each byte to the higher byte of a i16 vector
// so we can correctly sign extend. We don't care what happens to the
// lower byte.
SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
// r = VSELECT(r, shift(r, 4), a);
SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 2), a);
MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 1), a);
MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// Logical shift the result back to the lower byte, leaving a zero upper
// byte meaning that we can safely pack with PACKUSWB.
RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
}
if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = DAG.getConstant(0, dl, VT);
SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
}
if (VT == MVT::v8i16) {
// If we have a constant shift amount, the non-SSE41 path is best as
// avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
bool UseSSE41 = Subtarget.hasSSE41() &&
!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
return DAG.getSelect(dl, VT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
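// Same bit-serial idea as the vXi8 path, but with four stages (8/4/2/1)
// driven by the low 4 bits of each i16 amount; e.g. an amount of 9
// (0b1001) applies only the shift-by-8 and shift-by-1 stages.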
if (UseSSE41) {
// On SSE41 targets we need to replicate the shift mask in both
// bytes for PBLENDVB.
Amt = DAG.getNode(
ISD::OR, dl, VT,
getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
} else {
Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
}
// r = VSELECT(r, shift(r, 8), a);
SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 4), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
R = SignBitSelect(Amt, M, R);
return R;
}
// Decompose 256-bit shifts into 128-bit shifts.
if (VT.is256BitVector())
return split256IntArith(Op, DAG);
return SDValue();
}
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.isVector() && "Custom lowering only for vector rotates!");
SDLoc DL(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
int NumElts = VT.getVectorNumElements();
// Check for constant splat rotation amount.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
int CstSplatIndex = -1;
if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
for (int i = 0; i != NumElts; ++i)
if (!UndefElts[i]) {
if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
CstSplatIndex = i;
continue;
}
CstSplatIndex = -1;
break;
}
// AVX512 implicitly uses modulo rotation amounts.
if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
if (0 <= CstSplatIndex) {
unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
return DAG.getNode(Op, DL, VT, R,
DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
}
// Else, fall-back on VPROLV/VPRORV.
return Op;
}
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
// XOP implicitly uses modulo rotation amounts.
if (Subtarget.hasXOP()) {
if (VT.is256BitVector())
return split256IntArith(Op, DAG);
assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
// Attempt to rotate by immediate.
if (0 <= CstSplatIndex) {
uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
}
// Use general rotate by variable (per-element).
return Op;
}
// Split 256-bit integers on pre-AVX2 targets.
if (VT.is256BitVector() && !Subtarget.hasAVX2())
return split256IntArith(Op, DAG);
assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
Subtarget.hasAVX2())) &&
"Only vXi32/vXi16/vXi8 vector rotates supported");
// Rotate by a uniform constant - expand back to shifts.
if (0 <= CstSplatIndex)
return SDValue();
bool IsSplatAmt = DAG.isSplatValue(Amt);
// v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
// the amount bit.
if (EltSizeInBits == 8 && !IsSplatAmt) {
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
return SDValue();
// We don't need ModuloAmt here as we just peek at individual bits.
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = DAG.getConstant(0, DL, SelVT);
SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
return DAG.getSelect(DL, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
Amt = DAG.getBitcast(VT, Amt);
// r = VSELECT(r, rot(r, 4), a);
SDValue M;
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// r = VSELECT(r, rot(r, 2), a);
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// return VSELECT(r, rot(r, 1), a);
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
return SignBitSelect(VT, Amt, M, R);
}
// ISD::ROT* uses modulo rotate amounts.
Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
// Fallback for splats + all supported variable shifts.
// Fallback for non-constant AVX2 vXi16 as well.
if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
}
// As with shifts, convert the rotation amount to a multiplication factor.
SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
assert(Scale && "Failed to convert ROTL amount to scale");
// v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
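// Sketch: with Scale == 1 << Amt (Amt already reduced mod 16),
// rotl16(r, Amt) == ((r * Scale) & 0xFFFF) | mulhu(r, Scale).
// E.g. r = 0xABCD, Amt = 4: MUL gives 0xBCD0, MULHU gives 0x000A and the
// OR gives 0xBCDA.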
if (EltSizeInBits == 16) {
SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
// v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
// to v2i64 results at a time. The upper 32-bits contain the wrapped bits
// that can then be OR'd with the lower 32-bits.
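// Sketch, per even/odd lane pair: pmuludq computes zext64(r) * (1 << Amt),
// whose low 32 bits are r << Amt and whose high 32 bits are
// r >> (32 - Amt), i.e. exactly the two halves of rotl32(r, Amt) that the
// final OR merges.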
assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
static const int OddMask[] = {1, -1, 3, -1};
SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, R),
DAG.getBitcast(MVT::v2i64, Scale));
SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, R13),
DAG.getBitcast(MVT::v2i64, Scale13));
Res02 = DAG.getBitcast(VT, Res02);
Res13 = DAG.getBitcast(VT, Res13);
return DAG.getNode(ISD::OR, DL, VT,
DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
}
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
if (OpWidth == 128)
return Subtarget.hasCmpxchg16b();
return false;
}
// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
// TODO: In 32-bit mode, use FISTP when X87 is available?
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();
bool NoImplicitFloatOps =
SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
return false;
return needsCmpXchgNb(MemType);
}
// Note: this turns large loads into lock cmpxchg8b/16b.
// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
Type *MemType = LI->getType();
// If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
// can use movq to do the load. If we have X87 we can load into an 80-bit
// X87 register and store it to a stack temporary.
bool NoImplicitFloatOps =
LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE2() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
// and default to library calls otherwise.
if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
}
AtomicRMWInst::BinOp Op = AI->getOperation();
switch (Op) {
default:
llvm_unreachable("Unknown atomic operation");
case AtomicRMWInst::Xchg:
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
// It's better to use xadd, xsub or xchg for these in all cases.
return AtomicExpansionKind::None;
case AtomicRMWInst::Or:
case AtomicRMWInst::And:
case AtomicRMWInst::Xor:
// If the atomicrmw's result isn't actually used, we can just add a "lock"
// prefix to a normal instruction for these operations.
return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
case AtomicRMWInst::Nand:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
case AtomicRMWInst::FAdd:
case AtomicRMWInst::FSub:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
return AtomicExpansionKind::CmpXChg;
}
}
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
// harmful as it introduces an mfence.
if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;
// If this is a canonical idempotent atomicrmw w/no uses, we have a better
// lowering available in lowerAtomicArith.
// TODO: push more cases through this path.
if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
AI->use_empty())
return nullptr;
auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
// Before the load we need a fence. Here is an example lifted from
// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
// is required:
// Thread 0:
// x.store(1, relaxed);
// r1 = y.fetch_add(0, release);
// Thread 1:
// y.fetch_add(42, acquire);
// r2 = x.load(relaxed);
// r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
// lowered to just a load without a fence. An mfence flushes the store buffer,
// making the optimization clearly correct.
// FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
// otherwise; we might be able to be more aggressive on relaxed idempotent
// RMWs. In practice, they do not look useful, so we don't try to be
// especially clever.
if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
if (!Subtarget.hasMFence())
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
// enough that we do not bother.
return nullptr;
Function *MFence =
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence, {});
// Finally we can emit the atomic load.
LoadInst *Loaded =
Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
AI->getType()->getPrimitiveSizeInBits());
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
}
bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
if (!SI.isUnordered())
return false;
return ExperimentalUnorderedISEL;
}
bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
if (!LI.isUnordered())
return false;
return ExperimentalUnorderedISEL;
}
/// Emit a locked operation on a stack location which does not change any
/// memory location, but does involve a lock prefix. Location is chosen to be
/// a) very likely accessed only by a single thread to minimize cache traffic,
/// and b) definitely dereferenceable. Returns the new Chain result.
static SDValue emitLockedStackOp(SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue Chain, SDLoc DL) {
// Implementation notes:
// 1) LOCK prefix creates a full read/write reordering barrier for memory
// operations issued by the current processor. As such, the location
// referenced is not relevant for the ordering properties of the instruction.
// See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
// 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
// 2) Using an immediate operand appears to be the best encoding choice
// here since it doesn't require an extra register.
// 3) OR appears to be very slightly faster than ADD. (Though, the difference
// is small enough it might just be measurement noise.)
// 4) When choosing offsets, there are several contributing factors:
// a) If there's no redzone, we default to TOS. (We could allocate a cache
// line aligned stack object to improve this case.)
// b) To minimize our chances of introducing a false dependence, we prefer
// to offset the stack usage from TOS slightly.
// c) To minimize concerns about cross thread stack usage - in particular,
// the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
// captures state in the TOS frame and accesses it from many threads -
// we want to use an offset such that the offset is in a distinct cache
// line from the TOS frame.
//
// For a general discussion of the tradeoffs and benchmark results, see:
// https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
auto &MF = DAG.getMachineFunction();
auto &TFL = *Subtarget.getFrameLowering();
const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
if (Subtarget.is64Bit()) {
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
SDValue Ops[] = {
DAG.getRegister(X86::RSP, MVT::i64), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i64), // Index
DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i16), // Segment.
Zero,
Chain};
SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
MVT::Other, Ops);
return SDValue(Res, 1);
}
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
SDValue Ops[] = {
DAG.getRegister(X86::ESP, MVT::i32), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i32), // Index
DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i16), // Segment.
Zero,
Chain
};
SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
MVT::Other, Ops);
return SDValue(Res, 1);
}
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
AtomicOrdering FenceOrdering =
static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
SyncScope::ID FenceSSID =
static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
return emitLockedStackOp(DAG, Subtarget, Chain, dl);
}
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
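// Lower a cmpxchg: the expected value is copied into the size-appropriate
// accumulator (AL/AX/EAX/RAX), an LCMPXCHG node performs the locked
// compare-and-exchange, and success is read back from EFLAGS.ZF via COND_E.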
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT T = Op.getSimpleValueType();
SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
switch(T.SimpleTy) {
default: llvm_unreachable("Invalid value type!");
case MVT::i8: Reg = X86::AL; size = 1; break;
case MVT::i16: Reg = X86::AX; size = 2; break;
case MVT::i32: Reg = X86::EAX; size = 4; break;
case MVT::i64:
assert(Subtarget.is64Bit() && "Node not type legal!");
Reg = X86::RAX; size = 8;
break;
}
SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
Op.getOperand(2), SDValue());
SDValue Ops[] = { cpIn.getValue(0),
Op.getOperand(1),
Op.getOperand(3),
DAG.getTargetConstant(size, DL, MVT::i8),
cpIn.getValue(1) };
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
Ops, T, MMO);
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
MVT::i32, cpOut.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
cpOut, Success, EFLAGS.getValue(1));
}
// Create MOVMSKB, taking into account whether we need to split for AVX1.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT InVT = V.getSimpleValueType();
if (InVT == MVT::v64i8) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
DAG.getConstant(32, DL, MVT::i8));
return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
}
if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
DAG.getConstant(16, DL, MVT::i8));
return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
}
return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
// Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
// half to v32i1 and concatenating the result.
if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
assert(Subtarget.hasBWI() && "Expected BWI target");
SDLoc dl(Op);
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(0, dl));
Lo = DAG.getBitcast(MVT::v32i1, Lo);
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(1, dl));
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
}
// Custom splitting for BWI types when AVX512F is available but BWI isn't.
if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
SDLoc dl(Op);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
MVT CastVT = DstVT.getHalfNumVectorElementsVT();
Lo = DAG.getBitcast(CastVT, Lo);
Hi = DAG.getBitcast(CastVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
}
// Use MOVMSK for vector to scalar conversion to prevent scalarization.
if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
SDLoc DL(Op);
SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
V = getPMOVMSKB(DL, V, DAG, Subtarget);
return DAG.getZExtOrTrunc(V, DL, DstVT);
}
assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) && "Unexpected VT!");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
!(DstVT == MVT::x86mmx && SrcVT.isVector()))
// This conversion needs to be expanded.
return SDValue();
SDLoc dl(Op);
if (SrcVT.isVector()) {
// Widen the input vector by doubling its element count, e.g. from
// MVT::v2i32 to MVT::v4i32.
MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
SrcVT.getVectorNumElements() * 2);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
DAG.getUNDEF(SrcVT));
} else {
assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
}
MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
if (DstVT == MVT::x86mmx)
return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
DAG.getIntPtrConstant(0, dl));
}
/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(V);
MVT ByteVecVT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
"Expected value to have byte element type.");
assert(EltVT != MVT::i8 &&
"Horizontal byte sum only makes sense for wider elements!");
unsigned VecSize = VT.getSizeInBits();
assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
// The PSADBW instruction horizontally adds all bytes and leaves the result in
// i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
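// For example, a v16i8 input PSADBW'd against zero yields a v2i64 in which
// each i64 lane holds the sum of its 8 source bytes, which is exactly the
// v2i64 pop count when those bytes are per-byte pop counts.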
if (EltVT == MVT::i64) {
SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
return DAG.getBitcast(VT, V);
}
if (EltVT == MVT::i32) {
// We unpack the low half and high half into i32s interleaved with zeros so
// that we can use PSADBW to horizontally sum them. The most useful part of
// this is that it lines up the results of two PSADBW instructions to be
// two v2i64 vectors which concatenated are the 4 population counts. We can
// then use PACKUSWB to shrink and concatenate them into a v4i32 again.
SDValue Zeros = DAG.getConstant(0, DL, VT);
SDValue V32 = DAG.getBitcast(VT, V);
SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
// Do the horizontal sums into two v2i64s.
Zeros = DAG.getConstant(0, DL, ByteVecVT);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, Low), Zeros);
High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, High), Zeros);
// Merge them together.
MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
DAG.getBitcast(ShortVecVT, Low),
DAG.getBitcast(ShortVecVT, High));
return DAG.getBitcast(VT, V);
}
// The only element type left is i16.
assert(EltVT == MVT::i16 && "Unknown how to handle type");
// To obtain pop count for each i16 element starting from the pop count for
// i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
// right by 8. It is important to shift as i16s as i8 vector shift isn't
// directly supported.
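// Worked example for one i16 element whose two bytes hold pop counts H
// (high) and L (low): SHL 8 yields bytes [L, 0], the i8 ADD yields
// [H+L, L], and SRL 8 leaves [0, H+L], i.e. the i16 pop count in the low
// byte.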
SDValue ShifterV = DAG.getConstant(8, DL, VT);
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
DAG.getBitcast(ByteVecVT, V));
return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
int NumElts = VT.getVectorNumElements();
(void)EltVT;
assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
// Implement a lookup table in register by using an algorithm based on:
// http://wm.ite.pl/articles/sse-popcount.html
//
// The general idea is that every lower byte nibble in the input vector is an
// index into an in-register pre-computed pop count table. We then split the
// input vector into two new ones: (1) a vector with only the shifted-right
// higher nibbles for each byte and (2) a vector with the lower nibbles (and
// masked-out higher ones) for each byte. PSHUFB is used separately with both
// to index the in-register table. Next, both are added and the result is an
// i8 vector where each element contains the pop count for its input byte.
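// For example, for the input byte 0xB7 (0b10110111): the high nibble 0xB
// gives LUT[0xB] == 3 and the low nibble 0x7 gives LUT[0x7] == 3, so the
// summed result is 6 == popcount(0xB7).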
const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumElts; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
SDValue M0F = DAG.getConstant(0x0F, DL, VT);
// High nibbles
SDValue FourV = DAG.getConstant(4, DL, VT);
SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
// Low nibbles
SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
// The input vector is used as the shuffle mask that indexes elements into the
// LUT. After counting low and high nibbles, add the two results to obtain the
// final pop count per i8 element.
SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
}
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
"Unknown CTPOP type to handle");
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);
// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
if (Subtarget.hasVPOPCNTDQ()) {
unsigned NumElems = VT.getVectorNumElements();
assert((VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16) && "Unexpected type");
if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
}
}
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
return Lower512IntUnary(Op, DAG);
// For element types greater than i8, do vXi8 pop counts and a bytesum.
if (VT.getScalarType() != MVT::i8) {
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
}
// We can't use the fast LUT approach, so fall back on LegalizeDAG.
if (!Subtarget.hasSSSE3())
return SDValue();
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().isVector() &&
"We only do custom lowering for vector population count.");
return LowerVectorCTPOP(Op, Subtarget, DAG);
}
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
// For scalars, it's still beneficial to transfer to/from the SIMD unit to
// perform the BITREVERSE.
if (!VT.isVector()) {
MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
int NumElts = VT.getVectorNumElements();
int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector())
return Lower256IntUnary(Op, DAG);
assert(VT.is128BitVector() &&
"Only 128-bit vector bitreverse lowering supported.");
// VPPERM reverses the bits of each byte via the permute op (2 << 5), and we
// perform the BSWAP within the same shuffle.
// It's best to shuffle using the second operand, as this implicitly allows
// memory folding for multiple vectors.
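// For a v4i32 input, element 0 is thus built from source bytes 19, 18, 17,
// 16 (the BSWAP within the element), each ORed with 0x40 (2 << 5) to request
// the bit-reversed form of that byte from VPPERM.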
SmallVector<SDValue, 16> MaskElts;
for (int i = 0; i != NumElts; ++i) {
for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
int PermuteByte = SourceByte | (2 << 5);
MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
}
}
SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
SDValue Res = DAG.getBitcast(MVT::v16i8, In);
Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
Res, Mask);
return DAG.getBitcast(VT, Res);
}
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (Subtarget.hasXOP() && !VT.is512BitVector())
return LowerBITREVERSE_XOP(Op, DAG);
assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
// Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
// lowering.
if (VT == MVT::v8i64 || VT == MVT::v16i32) {
assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
return Lower512IntUnary(Op, DAG);
}
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
"Only byte vector BITREVERSE supported");
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles, and a PSHUFB lookup finds the bitreverse of each
// 0-15 value (moved to the other nibble).
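// Worked example for the input byte 0x2C (0b00101100): the low nibble 0xC
// gives LoLUT[0xC] == 0x30 and the high nibble 0x2 gives HiLUT[0x2] == 0x04;
// OR'ing them yields 0x34 (0b00110100), the bit reversal of 0x2C.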
SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
const int LoLUT[16] = {
/* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
/* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
/* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
/* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
const int HiLUT[16] = {
/* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
/* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
/* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
/* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
for (unsigned i = 0; i < NumElts; ++i) {
LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
}
SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NewOpc = 0;
switch (N->getOpcode()) {
case ISD::ATOMIC_LOAD_ADD:
NewOpc = X86ISD::LADD;
break;
case ISD::ATOMIC_LOAD_SUB:
NewOpc = X86ISD::LSUB;
break;
case ISD::ATOMIC_LOAD_OR:
NewOpc = X86ISD::LOR;
break;
case ISD::ATOMIC_LOAD_XOR:
NewOpc = X86ISD::LXOR;
break;
case ISD::ATOMIC_LOAD_AND:
NewOpc = X86ISD::LAND;
break;
default:
llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
}
MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
return DAG.getMemIntrinsicNode(
NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
{N->getOperand(0), N->getOperand(1), N->getOperand(2)},
/*MemVT=*/N->getSimpleValueType(0), MMO);
}
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
unsigned Opc = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// We can lower atomic_load_add into LXADD. However, any other atomicrmw op
// can only be lowered when the result is unused. They should have already
// been transformed into a cmpxchg loop in AtomicExpand.
if (N->hasAnyUseOfValue(0)) {
// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
// select LXADD if LOCK_SUB can't be selected.
if (Opc == ISD::ATOMIC_LOAD_SUB) {
RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
RHS, AN->getMemOperand());
}
assert(Opc == ISD::ATOMIC_LOAD_ADD &&
"Used AtomicRMW ops other than Add should have been expanded!");
return N;
}
// Specialized lowering for the canonical form of an idempotent atomicrmw.
// The core idea here is that since the memory location isn't actually
// changing, all we need is a lowering for the *ordering* impacts of the
// atomicrmw. As such, we can choose a different operation and memory
// location to minimize impact on other code.
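// The canonical idempotent form handled here is, e.g.:
// atomicrmw or i32* %p, i32 0 seq_cst
// which only needs its ordering effect; the stored value is unchanged.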
if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
// On X86, the only ordering which actually requires an instruction is
// seq_cst that isn't SingleThread; everything else just needs to be
// preserved during codegen and then dropped. Note that we expect (but don't
// assume) that orderings other than seq_cst and acq_rel have been
// canonicalized to a store or load.
if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
AN->getSyncScopeID() == SyncScope::System) {
// Prefer a locked operation against a stack location to minimize cache
// traffic. This assumes that stack locations are very likely to be
// accessed only by the owning thread.
SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), NewChain);
}
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), NewChain);
}
SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
// RAUW the chain, but don't worry about the result, as it's unused.
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), LockOp.getValue(1));
}
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
auto *Node = cast<AtomicSDNode>(Op.getNode());
SDLoc dl(Node);
EVT VT = Node->getMemoryVT();
bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
// If this store is not sequentially consistent and the type is legal
// we can just keep it.
if (!IsSeqCst && IsTypeLegal)
return Op;
if (VT == MVT::i64 && !IsTypeLegal) {
// For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
// FIXME: Use movlps with SSE1.
// FIXME: Use fist with X87.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
Subtarget.hasSSE2()) {
SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
Node->getOperand(2));
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
Ops, MVT::i64,
Node->getMemOperand());
// If this is a sequentially consistent store, also emit an appropriate
// barrier.
if (IsSeqCst)
Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
return Chain;
}
}
// Convert seq_cst store -> xchg
// Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
// FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
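// For example, a seq_cst i32 store becomes roughly 'xchgl %reg, (mem)';
// XCHG with a memory operand is implicitly locked, so no extra fence is
// needed.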
SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
Node->getMemoryVT(),
Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2),
Node->getMemOperand());
return Swap.getValue(1);
}
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
MVT VT = N->getSimpleValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
SDLoc DL(N);
// Set the carry flag.
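// Adding all-ones to the incoming carry materializes it in EFLAGS: CF is
// set iff Carry != 0 (e.g. 1 + 0xFFFFFFFF wraps and sets CF, while
// 0 + 0xFFFFFFFF does not).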
SDValue Carry = Op.getOperand(2);
EVT CarryVT = Carry.getValueType();
APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getConstant(NegOne, DL, CarryVT));
unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
Op.getOperand(1), Carry.getValue(1));
SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
// which returns the values as { float, float } (in XMM0) or
// { double, double } (which is returned in XMM0, XMM1).
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
Entry.IsSExt = false;
Entry.IsZExt = false;
Args.push_back(Entry);
bool isF64 = ArgVT == MVT::f64;
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
const char *LibcallName = TLI.getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
: (Type *)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
.setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
if (isF64)
// Returned in xmm0 and xmm1.
return CallResult.first;
// Returned in bits 0:31 and 32:63 of xmm0.
SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
CallResult.first, DAG.getIntPtrConstant(0, dl));
SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
CallResult.first, DAG.getIntPtrConstant(1, dl));
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
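/// For example (sketching the common path below), widening a v2f32 value X
/// to v8f32 produces (v8f32 (insert_subvector FillVal, X, 0)), where FillVal
/// is a zero vector or undef depending on FillWithZeroes.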
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
bool FillWithZeroes = false) {
// Check if InOp already has the right width.
MVT InVT = InOp.getSimpleValueType();
if (InVT == NVT)
return InOp;
if (InOp.isUndef())
return DAG.getUNDEF(NVT);
assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match");
unsigned InNumElts = InVT.getVectorNumElements();
unsigned WidenNumElts = NVT.getVectorNumElements();
assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
"Unexpected request for vector widening");
SDLoc dl(InOp);
if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
InOp.getNumOperands() == 2) {
SDValue N1 = InOp.getOperand(1);
if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
N1.isUndef()) {
InOp = InOp.getOperand(0);
InVT = InOp.getSimpleValueType();
InNumElts = InVT.getVectorNumElements();
}
}
if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
SmallVector<SDValue, 16> Ops;
for (unsigned i = 0; i < InNumElts; ++i)
Ops.push_back(InOp.getOperand(i));
EVT EltVT = InOp.getOperand(0).getValueType();
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
Ops.push_back(FillVal);
return DAG.getBuildVector(NVT, dl, Ops);
}
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
DAG.getUNDEF(NVT);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
InOp, DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
SDValue Src = N->getValue();
MVT VT = Src.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
SDValue Scale = N->getScale();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
if (VT == MVT::v2f32 || VT == MVT::v2i32) {
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
// If the index is v2i64 and we have VLX we can use xmm for data and index.
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
return SDValue(NewScatter.getNode(), 1);
}
return SDValue();
}
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
// If the index is v2i32, we're being called by type legalization and we
// should just let the default handling take care of it.
if (IndexVT == MVT::v2i32)
return SDValue();
// If we don't have VLX and neither the passthru nor the index is 512 bits,
// we need to widen until one is.
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
unsigned Factor = std::min(512/VT.getSizeInBits(),
512/IndexVT.getSizeInBits());
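// E.g. for v4f32 data with a v4i64 index: Factor = min(512/128, 512/256)
// == 2, so both are widened to 8 elements (v8f32 data, v8i64 index).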
unsigned NumElts = VT.getVectorNumElements() * Factor;
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
Src = ExtendToType(Src, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
return SDValue(NewScatter.getNode(), 1);
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
MVT MaskVT = Mask.getSimpleValueType();
SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
// Handle AVX masked loads, which don't support a passthru value other than 0.
if (MaskVT.getVectorElementType() != MVT::i1) {
// We also allow undef in the isel pattern.
if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
return Op;
SDValue NewLoad = DAG.getMaskedLoad(
VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
N->isExpandingLoad());
// Emit a blend.
SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
PassThru);
return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
}
assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!");
assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
"Expanding masked load is supported for 32 and 64-bit types only!");
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked load op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked load op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bits.
unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
PassThru = ExtendToType(PassThru, WideDataVT, DAG);
// Mask element has to be i1.
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
"Unexpected mask type");
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
SDValue NewLoad = DAG.getMaskedLoad(
WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
N->getExtensionType(), N->isExpandingLoad());
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
NewLoad.getValue(0),
DAG.getIntPtrConstant(0, dl));
SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
SDValue DataToStore = N->getValue();
MVT VT = DataToStore.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
SDLoc dl(Op);
assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
"Compressing masked store is supported on AVX-512 target only!");
assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
"Compressing masked store is supported for 32 and 64-bit types only!");
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked store op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked store op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bits.
unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
// Mask element has to be i1.
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
"Unexpected mask type");
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
N->getOffset(), Mask, N->getMemoryVT(),
N->getMemOperand(), N->getAddressingMode(),
N->isTruncatingStore(), N->isCompressingStore());
}
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue PassThru = N->getPassThru();
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
// If the index is v2i32, we're being called by type legalization.
if (IndexVT == MVT::v2i32)
return SDValue();
// If we don't have VLX and neither the passthru nor the index is 512 bits,
// we need to widen until one is.
MVT OrigVT = VT;
if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
!IndexVT.is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
unsigned Factor = std::min(512/VT.getSizeInBits(),
512/IndexVT.getSizeInBits());
unsigned NumElts = VT.getVectorNumElements() * Factor;
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
PassThru = ExtendToType(PassThru, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
N->getScale() };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
N->getMemOperand());
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
NewGather, DAG.getIntPtrConstant(0, dl));
return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
}
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
SDValue Src = Op.getOperand(0);
MVT DstVT = Op.getSimpleValueType();
AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
unsigned SrcAS = N->getSrcAddressSpace();
assert(SrcAS != N->getDestAddressSpace() &&
"addrspacecast must be between different address spaces");
if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
} else if (DstVT == MVT::i64) {
Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
} else if (DstVT == MVT::i32) {
Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
} else {
report_fatal_error("Bad address space in addrspacecast");
}
return Op;
}
SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
SelectionDAG &DAG) const {
// TODO: Eventually, the lowering of these nodes should be informed by or
// deferred to the GC strategy for the function in which they appear. For
// now, however, they must be lowered to something. Since they are logically
// no-ops in the case of a null GC strategy (or a GC strategy which does not
// require special handling for these nodes), lower them as literal NOOPs for
// the time being.
SmallVector<SDValue, 2> Ops;
Ops.push_back(Op.getOperand(0));
if (Op->getGluedNode())
Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
SDLoc OpDL(Op);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
return NOOP;
}
SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
bool IsStrict = Op->isStrictFPOpcode();
unsigned Offset = IsStrict ? 1 : 0;
SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
SDLoc dl(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
CallOptions, dl, Chain);
if (IsStrict)
return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
return Tmp.first;
}
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
case ISD::STRICT_SINT_TO_FP:
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::STRICT_UINT_TO_FP:
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC:
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::EH_SJLJ_SETUP_DISPATCH:
return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::MULHS:
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
case ISD::ROTL:
case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::ADD:
case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
case ISD::UADDSAT:
case ISD::SADDSAT:
case ISD::USUBSAT:
case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN: return LowerMINMAX(Op, DAG);
case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
case ISD::ADDRSPACECAST:
return LowerADDRSPACECAST(Op, DAG);
}
}
/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDValue Res = LowerOperation(SDValue(N, 0), DAG);
if (!Res.getNode())
return;
// If the original node has one result, take the return value from
// LowerOperation as is. It might not be result number 0.
if (N->getNumValues() == 1) {
Results.push_back(Res);
return;
}
// If the original node has multiple results, then the return node should
// have the same number of results.
assert((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!");
// Place the new result values based on the result number in N.
for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
Results.push_back(Res.getValue(I));
}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
SDLoc dl(N);
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "ReplaceNodeResults: ";
N->dump(&DAG);
#endif
llvm_unreachable("Do not know how to custom type legalize this operation!");
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
SDValue Wide =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
// The bit count fits in 32 bits; extract it as i32 and then zero extend
// to i64. Otherwise we'd end up extracting bits 63:32 separately.
Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
DAG.getIntPtrConstant(0, dl));
Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
Results.push_back(Wide);
}
return;
}
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
// Pre-promote these to vXi16 to avoid op legalization thinking all 16
// elements are needed.
MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
unsigned NumConcats = 16 / VT.getVectorNumElements();
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
Results.push_back(Res);
return;
}
case X86ISD::VPMADDWD:
case X86ISD::AVG: {
// Legalize types for X86ISD::AVG/VPMADDWD by widening.
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
EVT InVT = N->getOperand(0).getValueType();
assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
"Expected a VT that divides into 128 bits.");
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
unsigned NumConcat = 128 / InVT.getSizeInBits();
EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(),
NumConcat * InVT.getVectorNumElements());
EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
VT.getVectorElementType(),
NumConcat * VT.getVectorNumElements());
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = N->getOperand(0);
SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
Ops[0] = N->getOperand(1);
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
Results.push_back(Res);
return;
}
case ISD::ABS: {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(N->getValueType(0) == MVT::i64 &&
"Unexpected type (!= i64) on ABS.");
MVT HalfT = MVT::i32;
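// Expand abs(x) on i64 as (x + sign) ^ sign, where sign = x >>s 63, using
// 32-bit halves chained through UADDO/ADDCARRY below.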
SDValue Lo, Hi, Tmp;
SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
DAG.getConstant(0, dl, HalfT));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
DAG.getConstant(1, dl, HalfT));
Tmp = DAG.getNode(
ISD::SRA, dl, HalfT, Hi,
DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
SDValue(Lo.getNode(), 1));
Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
Results.push_back(Lo);
Results.push_back(Hi);
return;
}
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
case X86ISD::FMAXC:
case X86ISD::FMAX: {
EVT VT = N->getValueType(0);
assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
SDValue UNDEF = DAG.getUNDEF(VT);
SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(0), UNDEF);
SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(1), UNDEF);
Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
return;
}
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM: {
EVT VT = N->getValueType(0);
if (VT.isVector()) {
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
// If the RHS is a constant splat vector, we can widen this op and let the
// division/remainder-by-constant optimization handle it.
// TODO: Can we do something for non-splat?
APInt SplatVal;
if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
unsigned NumConcats = 128 / VT.getSizeInBits();
SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
Ops0[0] = N->getOperand(0);
EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
Results.push_back(Res);
}
return;
}
LLVM_FALLTHROUGH;
}
case ISD::SDIVREM:
case ISD::UDIVREM: {
SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
Results.push_back(V);
return;
}
case ISD::TRUNCATE: {
MVT VT = N->getSimpleValueType(0);
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
return;
// The generic legalizer will try to widen the input type to the same
// number of elements as the widened result type. But this isn't always
// the best choice, so do some custom legalization to avoid certain cases.
MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
unsigned InBits = InVT.getSizeInBits();
if (128 % InBits == 0) {
// 128-bit and smaller inputs should avoid the truncate altogether and
// just use a build_vector that will become a shuffle.
// TODO: Widen and use a shuffle directly?
MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
EVT EltVT = VT.getVectorElementType();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
// Use the original element count so we don't do more scalar opts than
// necessary.
unsigned MinElts = VT.getVectorNumElements();
for (unsigned i=0; i < MinElts; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
DAG.getIntPtrConstant(i, dl));
Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
}
Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
return;
}
// With AVX512 there are some cases that can use a target specific
// truncate node to go from 256/512 to less than 128 with zeros in the
// upper elements of the 128 bit result.
if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
// We can use VTRUNC directly for 256-bit inputs with VLX, or for any 512-bit
// input.
if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
return;
}
// There's one case we can widen to 512 bits and use VTRUNC.
if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
DAG.getUNDEF(MVT::v4i64));
Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
return;
}
}
if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
isTypeLegal(MVT::v4i64)) {
// The input needs to be split and the output needs to be widened. Use two
// VTRUNCs, and shuffle their results together into the wider type.
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
{ 0, 1, 2, 3, 16, 17, 18, 19,
-1, -1, -1, -1, -1, -1, -1, -1 });
Results.push_back(Res);
return;
}
return;
}
case ISD::ANY_EXTEND:
// Right now, only MVT::v8i8 has Custom action for an illegal type.
// It's intended to custom handle the input type.
assert(N->getValueType(0) == MVT::v8i8 &&
"Do not know how to legalize this Node");
return;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
(InVT == MVT::v4i16 || InVT == MVT::v4i8)){
assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
"Unexpected type action!");
assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
// sra, followed by an extend from i32 to i64 using pcmpgt. By custom
// splitting, we allow the sra from the extend to i32 to be shared by the
// split halves.
In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
// Fill a vector with sign bits for each element.
SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
// Create an unpackl and unpackh to interleave the sign bits then bitcast
// to v2i64.
SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
{0, 4, 1, 5});
Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
{2, 6, 3, 7});
Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
return;
}
if (VT == MVT::v16i32 || VT == MVT::v8i64) {
if (!InVT.is128BitVector()) {
// Not a 128 bit vector, but maybe type legalization will promote
// it to 128 bits.
if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
return;
InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
if (!InVT.is128BitVector())
return;
// Promote the input to 128 bits. Type legalization will turn this into
// zext_inreg/sext_inreg.
In = DAG.getNode(N->getOpcode(), dl, InVT, In);
}
// Perform custom splitting instead of the two stage extend we would get
// by default.
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
assert(isTypeLegal(LoVT) && "Split VT not legal?");
SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
// We need to shift the input over by half the number of elements.
unsigned NumElts = InVT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
for (unsigned i = 0; i != HalfNumElts; ++i)
ShufMask[i] = i + HalfNumElts;
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
}
return;
}
case ISD::FP_TO_SINT:
case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::STRICT_FP_TO_UINT: {
bool IsStrict = N->isStrictFPOpcode();
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
N->getOpcode() == ISD::STRICT_FP_TO_SINT;
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
// Try to create a 128 bit vector, but don't exceed a 32 bit element.
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
VT.getVectorNumElements());
SDValue Res;
SDValue Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
{N->getOperand(0), Src});
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
// Preserve what we know about the size of the original result, except when
// the result is v2i32, since we can't widen the assert.
if (PromoteVT != MVT::v2i32)
Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
dl, PromoteVT, Res,
DAG.getValueType(VT.getVectorElementType()));
// Truncate back to the original width.
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
// Now widen to 128 bits.
unsigned NumConcats = 128 / VT.getSizeInBits();
MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
VT.getVectorNumElements() * NumConcats);
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
Results.push_back(Res);
if (IsStrict)
Results.push_back(Chain);
return;
}
if (VT == MVT::v2i32) {
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
if (Src.getValueType() == MVT::v2f64) {
unsigned Opc;
if (IsStrict)
Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
else
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
// If we have VLX, we can emit a target-specific FP_TO_UINT node.
if (!IsSigned && !Subtarget.hasVLX()) {
// Otherwise we can defer to the generic legalizer which will widen
// the input as well. This will be further widened during op
// legalization to v8i32<-v8f64.
// For strict nodes we'll need to widen ourselves.
// FIXME: Fix the type legalizer to safely widen strict nodes?
if (!IsStrict)
return;
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
DAG.getConstantFP(0.0, dl, MVT::v2f64));
Opc = N->getOpcode();
}
SDValue Res;
SDValue Chain;
if (IsStrict) {
Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
{N->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
}
Results.push_back(Res);
if (IsStrict)
Results.push_back(Chain);
return;
}
// Custom widen strict v2f32->v2i32 by padding with zeros.
// FIXME: Should generic type legalizer do this?
if (Src.getValueType() == MVT::v2f32 && IsStrict) {
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getConstantFP(0.0, dl, MVT::v2f32));
SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
{N->getOperand(0), Src});
Results.push_back(Res);
Results.push_back(Res.getValue(1));
return;
}
// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
// so early out here.
return;
}
assert(!VT.isVector() && "Vectors should have been handled above!");
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
// If we use a 128-bit result we might need to use a target specific node.
unsigned SrcElts =
std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
unsigned Opc = N->getOpcode();
if (NumElts != SrcElts) {
if (IsStrict)
Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
else
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
}
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
DAG.getConstantFP(0.0, dl, VecInVT), Src,
ZeroIdx);
SDValue Chain;
if (IsStrict) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
Chain = Res.getValue(1);
} else
Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
Results.push_back(Res);
if (IsStrict)
Results.push_back(Chain);
return;
}
SDValue Chain;
if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
Results.push_back(V);
if (IsStrict)
Results.push_back(Chain);
}
return;
}
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::STRICT_UINT_TO_FP: {
bool IsStrict = N->isStrictFPOpcode();
bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
N->getOpcode() == ISD::STRICT_SINT_TO_FP;
EVT VT = N->getValueType(0);
if (VT != MVT::v2f32)
return;
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
if (IsStrict) {
unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
: X86ISD::STRICT_CVTUI2P;
SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), Src});
Results.push_back(Res);
Results.push_back(Res.getValue(1));
} else {
unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
}
return;
}
if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
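// Convert unsigned v2i64 -> v2f32 using only signed conversions: for
// inputs with the sign bit set, halve the value while OR-ing the low bit
// back in (so the final doubling rounds correctly), convert as signed,
// then double the result with an fadd; other inputs convert directly and
// a select picks the right result per element.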
SDValue Zero = DAG.getConstant(0, dl, SrcVT);
SDValue One = DAG.getConstant(1, dl, SrcVT);
SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
for (int i = 0; i != 2; ++i) {
SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
SignSrc, DAG.getIntPtrConstant(i, dl));
if (IsStrict)
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
{N->getOperand(0), Src});
else
SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src);
}
SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
SDValue Slow, Chain;
if (IsStrict) {
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
SignCvts[0].getValue(1), SignCvts[1].getValue(1));
Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
{Chain, SignCvt, SignCvt});
Chain = Slow.getValue(1);
} else {
Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
}
IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
IsNeg =
DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
Results.push_back(Cvt);
if (IsStrict)
Results.push_back(Chain);
return;
}
if (SrcVT != MVT::v2i32)
return;
if (IsSigned || Subtarget.hasAVX512()) {
if (!IsStrict)
return;
// Custom widen strict v2i32->v2f32 to avoid scalarization.
// FIXME: Should generic type legalizer do this?
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getConstant(0, dl, MVT::v2i32));
SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), Src});
Results.push_back(Res);
Results.push_back(Res.getValue(1));
return;
}
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
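// Lower unsigned v2i32 -> v2f64 (then round to v2f32) with a bit trick:
// 0x4330000000000000 is the IEEE-754 encoding of 2^52, so OR-ing the
// zero-extended 32-bit value into the low mantissa bits yields exactly
// the double 2^52 + x, and subtracting 2^52 recovers x in FP form.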
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
if (IsStrict) {
SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
{N->getOperand(0), Or, VBias});
SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
{MVT::v4f32, MVT::Other},
{Sub.getValue(1), Sub});
Results.push_back(Res);
Results.push_back(Res.getValue(1));
} else {
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
}
return;
}
case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: {
bool IsStrict = N->isStrictFPOpcode();
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
if (!isTypeLegal(Src.getValueType()))
return;
SDValue V;
if (IsStrict)
V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), N->getOperand(1)});
else
V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
Results.push_back(V);
if (IsStrict)
Results.push_back(V.getValue(1));
return;
}
case ISD::FP_EXTEND: {
// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
// No other ValueType for FP_EXTEND should reach this point.
assert(N->getValueType(0) == MVT::v2f32 &&
"Do not know how to legalize this Node");
return;
}
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
case Intrinsic::x86_rdtsc:
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
Results);
case Intrinsic::x86_rdtscp:
return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
Results);
case Intrinsic::x86_rdpmc:
expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
Results);
return;
case Intrinsic::x86_xgetbv:
expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
Results);
return;
}
}
case ISD::READCYCLECOUNTER: {
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
}
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
"64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
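// CMPXCHG8B/16B takes the expected value in EDX:EAX (RDX:RAX) and the
// replacement in ECX:EBX (RCX:RBX); it returns the loaded value in
// EDX:EAX (RDX:RAX) and reports success via ZF. Split both wide operands
// into halves and marshal them into those register pairs.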
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(0, dl, HalfT));
cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(1, dl, HalfT));
cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
cpInL, SDValue());
cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
Regs64bit ? X86::RDX : X86::EDX,
cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH;
swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(0, dl, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(1, dl, HalfT));
swapInH =
DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
swapInH, cpInH.getValue(1));
// If the current function needs the base pointer, RBX,
// we shouldn't use cmpxchg directly.
// Indeed, lowering that instruction clobbers RBX, and since RBX
// will be a reserved register, the register allocator will not
// ensure its value is saved and restored around this live range.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
Register BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
(BasePtr == X86::RBX || BasePtr == X86::EBX)) {
// ISel prefers the LCMPXCHG64 variant.
// If that assert breaks, that means it is not the case anymore,
// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
// not just EBX. This is a matter of accepting i64 input for that
// pseudo, and restoring into a register of the right width
// in the expand pseudo. Everything else should just work.
assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
"Saving only half of the RBX");
unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX,
HalfT, swapInH.getValue(1));
SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
RBXSave,
/*Glue*/ RBXSave.getValue(2)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
} else {
unsigned Opcode =
Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX, swapInL,
swapInH.getValue(1));
SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
swapInL.getValue(1)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
}
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
MVT::i32, cpOutH.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
Results.push_back(Success);
Results.push_back(EFLAGS.getValue(1));
return;
}
case ISD::ATOMIC_LOAD: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);
if (Subtarget.hasSSE2()) {
// Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
// lower 64-bits.
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
Results.push_back(Ld.getValue(1));
return;
}
if (Subtarget.hasX87()) {
// First load this into an 80-bit X87 register. This will put the whole
// integer into the significand.
// FIXME: Do we need to glue? See FIXME comment in BuildFILD.
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
dl, Tys, Ops, MVT::i64,
Node->getMemOperand());
SDValue Chain = Result.getValue(1);
SDValue InFlag = Result.getValue(2);
// Now store the X87 register to a stack temporary and convert to i64.
// This store is not atomic and doesn't need to be.
// FIXME: We don't need a stack temporary if the result of the load
// is already being stored. We could just directly store there.
SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
DAG.getVTList(MVT::Other), StoreOps,
MVT::i64, MPI, 0 /*Align*/,
MachineMemOperand::MOStore);
// Finally load the value back from the stack temporary and return it.
// This load is not atomic and doesn't need to be.
// This load will be further type legalized.
Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
Results.push_back(Result);
Results.push_back(Result.getValue(1));
return;
}
}
// TODO: Use MOVLPS when SSE1 is available?
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
}
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
// If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
// we can split using the k-register rather than memory.
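// (on BWI each v32i1 half can be copied straight out of a k-register
// with a single KMOVD, so no stack round-trip is needed).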
if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
Lo = DAG.getBitcast(MVT::i32, Lo);
Hi = DAG.getBitcast(MVT::i32, Hi);
SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
Results.push_back(Res);
return;
}
// Custom splitting for BWI types when AVX512F is available but BWI isn't.
if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
SrcVT.isVector() && isTypeLegal(SrcVT)) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
Lo = DAG.getBitcast(CastVT, Lo);
Hi = DAG.getBitcast(CastVT, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
Results.push_back(Res);
return;
}
if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
"Unexpected type action!");
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
Results.push_back(Res);
return;
}
return;
}
case ISD::MGATHER: {
EVT VT = N->getValueType(0);
if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
(Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
if (Index.getValueType() != MVT::v2i64)
return;
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
Gather->getPassThru(),
DAG.getUNDEF(VT));
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getUNDEF(MVT::v2i1));
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(2));
return;
}
return;
}
case ISD::LOAD: {
// Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
// avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
// cast since type legalization will try to use an i64 load.
MVT VT = N->getSimpleValueType(0);
assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
if (Subtarget.hasSSE2()) {
MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
MVT VecVT = MVT::getVectorVT(LdVT, 2);
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
Res = DAG.getBitcast(WideVT, Res);
Results.push_back(Res);
Results.push_back(Chain);
return;
}
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Ld->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(1));
return;
}
case ISD::ADDRSPACECAST: {
SDValue Src = N->getOperand(0);
EVT DstVT = N->getValueType(0);
AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
unsigned SrcAS = CastN->getSrcAddressSpace();
assert(SrcAS != CastN->getDestAddressSpace() &&
"addrspacecast must be between different address spaces");
SDValue Res;
if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64)
Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
else if (DstVT == MVT::i64)
Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
else if (DstVT == MVT::i32)
Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
else
report_fatal_error("Unrecognized addrspacecast type legalization");
Results.push_back(Res);
return;
}
}
}
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((X86ISD::NodeType)Opcode) {
case X86ISD::FIRST_NUMBER: break;
case X86ISD::BSF: return "X86ISD::BSF";
case X86ISD::BSR: return "X86ISD::BSR";
case X86ISD::SHLD: return "X86ISD::SHLD";
case X86ISD::SHRD: return "X86ISD::SHRD";
case X86ISD::FAND: return "X86ISD::FAND";
case X86ISD::FANDN: return "X86ISD::FANDN";
case X86ISD::FOR: return "X86ISD::FOR";
case X86ISD::FXOR: return "X86ISD::FXOR";
case X86ISD::FILD: return "X86ISD::FILD";
case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
case X86ISD::FIST: return "X86ISD::FIST";
case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM";
case X86ISD::FLD: return "X86ISD::FLD";
case X86ISD::FST: return "X86ISD::FST";
case X86ISD::CALL: return "X86ISD::CALL";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP";
case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS";
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM";
case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCC: return "X86ISD::FSETCC";
case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
case X86ISD::IRET: return "X86ISD::IRET";
case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
case X86ISD::Wrapper: return "X86ISD::Wrapper";
case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ";
case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
case X86ISD::PINSRB: return "X86ISD::PINSRB";
case X86ISD::PINSRW: return "X86ISD::PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
case X86ISD::BLENDV: return "X86ISD::BLENDV";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMAXS: return "X86ISD::FMAXS";
case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE";
case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FMINS: return "X86ISD::FMINS";
case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE";
case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
case X86ISD::EH_SJLJ_SETUP_DISPATCH:
return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
case X86ISD::LADD: return "X86ISD::LADD";
case X86ISD::LSUB: return "X86ISD::LSUB";
case X86ISD::LOR: return "X86ISD::LOR";
case X86ISD::LXOR: return "X86ISD::LXOR";
case X86ISD::LAND: return "X86ISD::LAND";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE";
case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC";
case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS";
case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS";
case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT";
case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND";
case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
case X86ISD::VSRL: return "X86ISD::VSRL";
case X86ISD::VSRA: return "X86ISD::VSRA";
case X86ISD::VSHLI: return "X86ISD::VSHLI";
case X86ISD::VSRLI: return "X86ISD::VSRLI";
case X86ISD::VSRAI: return "X86ISD::VSRAI";
case X86ISD::VSHLV: return "X86ISD::VSHLV";
case X86ISD::VSRLV: return "X86ISD::VSRLV";
case X86ISD::VSRAV: return "X86ISD::VSRAV";
case X86ISD::VROTLI: return "X86ISD::VROTLI";
case X86ISD::VROTRI: return "X86ISD::VROTRI";
case X86ISD::VPPERM: return "X86ISD::VPPERM";
case X86ISD::CMPP: return "X86ISD::CMPP";
case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
case X86ISD::ADC: return "X86ISD::ADC";
case X86ISD::SBB: return "X86ISD::SBB";
case X86ISD::SMUL: return "X86ISD::SMUL";
case X86ISD::UMUL: return "X86ISD::UMUL";
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::BZHI: return "X86ISD::BZHI";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::KTEST: return "X86ISD::KTEST";
case X86ISD::KADD: return "X86ISD::KADD";
case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::VALIGN: return "X86ISD::VALIGN";
case X86ISD::VSHLD: return "X86ISD::VSHLD";
case X86ISD::VSHRD: return "X86ISD::VSHRD";
case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
case X86ISD::SHUFP: return "X86ISD::SHUFP";
case X86ISD::SHUF128: return "X86ISD::SHUF128";
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
case X86ISD::MOVSD: return "X86ISD::MOVSD";
case X86ISD::MOVSS: return "X86ISD::MOVSS";
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE";
case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE";
case X86ISD::VRANGES: return "X86ISD::VRANGES";
case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::PSADBW: return "X86ISD::PSADBW";
case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
case X86ISD::MFENCE: return "X86ISD::MFENCE";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
case X86ISD::RDSEED: return "X86ISD::RDSEED";
case X86ISD::RDPKRU: return "X86ISD::RDPKRU";
case X86ISD::WRPKRU: return "X86ISD::WRPKRU";
case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
case X86ISD::VPSHA: return "X86ISD::VPSHA";
case X86ISD::VPSHL: return "X86ISD::VPSHL";
case X86ISD::VPCOM: return "X86ISD::VPCOM";
case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";
case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE";
case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE";
case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE";
case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE";
case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
case X86ISD::SELECTS: return "X86ISD::SELECTS";
case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
case X86ISD::RCP14: return "X86ISD::RCP14";
case X86ISD::RCP14S: return "X86ISD::RCP14S";
case X86ISD::RCP28: return "X86ISD::RCP28";
case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE";
case X86ISD::RCP28S: return "X86ISD::RCP28S";
case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE";
case X86ISD::EXP2: return "X86ISD::EXP2";
case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE";
case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
case X86ISD::FADDS: return "X86ISD::FADDS";
case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
case X86ISD::FSUBS: return "X86ISD::FSUBS";
case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
case X86ISD::FMULS: return "X86ISD::FMULS";
case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
case X86ISD::FDIVS: return "X86ISD::FDIVS";
case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
case X86ISD::FSQRTS: return "X86ISD::FSQRTS";
case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
case X86ISD::FGETEXP: return "X86ISD::FGETEXP";
case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE";
case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS";
case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE";
case X86ISD::SCALEF: return "X86ISD::SCALEF";
case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND";
case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND";
case X86ISD::AVG: return "X86ISD::AVG";
case X86ISD::MULHRS: return "X86ISD::MULHRS";
case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI";
case X86ISD::STRICT_CVTTP2UI: return "X86ISD::STRICT_CVTTP2UI";
case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE";
case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI";
case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI";
case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE";
case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P";
case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P";
case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP";
case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP";
case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE";
case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI";
case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI";
case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI";
case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16";
case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16";
case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16";
case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS";
case X86ISD::LWPINS: return "X86ISD::LWPINS";
case X86ISD::MGATHER: return "X86ISD::MGATHER";
case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
case X86ISD::ENQCMD: return "X86ISD::ENQCMD";
case X86ISD::ENQCMDS: return "X86ISD::ENQCMDS";
case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT";
}
return nullptr;
}
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I) const {
// X86 supports extremely general addressing modes.
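// The canonical form is base + index*scale + disp32 (plus an optional
// segment), with the hardware only encoding scales of 1, 2, 4 and 8.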
CodeModel::Model M = getTargetMachine().getCodeModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
return false;
if (AM.BaseGV) {
unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
// If a reference to this global requires an extra load, we can't fold it.
if (isGlobalStubReference(GVFlags))
return false;
// If BaseGV requires a register for the PIC base, we cannot also have a
// BaseReg specified.
if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
return false;
// If lower 4G is not available, then we must use rip-relative addressing.
if ((M != CodeModel::Small || isPositionIndependent()) &&
Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
return false;
}
switch (AM.Scale) {
case 0:
case 1:
case 2:
case 4:
case 8:
// These scales always work.
break;
case 3:
case 5:
case 9:
// These scales are formed with basereg+scalereg. Only accept if there is
// no basereg yet.
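// (LEA can encode reg + reg*{2,4,8}, which yields effective scales of
// 3, 5 and 9 when the scaled register doubles as the base.)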
if (AM.HasBaseReg)
return false;
break;
default: // Other stuff never works.
return false;
}
return true;
}
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
unsigned Bits = Ty->getScalarSizeInBits();
// 8-bit shifts are always expensive, but versions with a scalar amount aren't
// particularly cheaper than those without.
if (Bits == 8)
return false;
// XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
(Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
return false;
// AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
// shifts just as cheap as scalar ones.
if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
return false;
// AVX512BW has shifts such as vpsllvw.
if (Subtarget.hasBWI() && Bits == 16)
return false;
// Otherwise, it's significantly cheaper to shift by a scalar amount than by a
// fully general vector.
return true;
}
bool X86TargetLowering::isBinOp(unsigned Opcode) const {
switch (Opcode) {
// These are non-commutative binops.
// TODO: Add more X86ISD opcodes once we have test coverage.
case X86ISD::ANDNP:
case X86ISD::PCMPGT:
case X86ISD::FMAX:
case X86ISD::FMIN:
case X86ISD::FANDN:
return true;
}
return TargetLoweringBase::isBinOp(Opcode);
}
bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
switch (Opcode) {
// TODO: Add more X86ISD opcodes once we have test coverage.
case X86ISD::PCMPEQ:
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ:
case X86ISD::FMAXC:
case X86ISD::FMINC:
case X86ISD::FAND:
case X86ISD::FOR:
case X86ISD::FXOR:
return true;
}
return TargetLoweringBase::isCommutativeBinOp(Opcode);
}
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 > NumBits2;
}
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
if (!isTypeLegal(EVT::getEVT(Ty1)))
return false;
assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
// Assuming the caller doesn't have a zeroext or signext return parameter,
// truncation all the way down to i1 is valid.
return true;
}
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<32>(Imm);
}
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
// Can also use sub to handle negated immediates.
return isInt<32>(Imm);
}
bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
return isInt<32>(Imm);
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 > NumBits2;
}
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
EVT VT1 = Val.getValueType();
if (isZExtFree(VT1, VT2))
return true;
if (Val.getOpcode() != ISD::LOAD)
return false;
if (!VT1.isSimple() || !VT1.isInteger() ||
!VT2.isSimple() || !VT2.isInteger())
return false;
switch (VT1.getSimpleVT().SimpleTy) {
default: break;
case MVT::i8:
case MVT::i16:
case MVT::i32:
// X86 has 8, 16, and 32-bit zero-extending loads.
return true;
}
return false;
}
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
return false;
EVT SrcVT = ExtVal.getOperand(0).getValueType();
// There is no extending load for vXi1.
if (SrcVT.getScalarType() == MVT::i1)
return false;
return true;
}
bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
if (!Subtarget.hasAnyFMA())
return false;
VT = VT.getScalarType();
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
return true;
default:
break;
}
return false;
}
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
// i16 instructions are longer (0x66 prefix) and potentially slower.
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
if (!VT.isSimple())
return false;
// Not for i1 vectors
if (VT.getSimpleVT().getScalarType() == MVT::i1)
return false;
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSimpleVT().getSizeInBits() == 64)
return false;
// We only care that the types being shuffled are legal. The lowering can
// handle any possible shuffle mask that results.
return isTypeLegal(VT.getSimpleVT());
}
bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
EVT VT) const {
// Don't convert an 'and' into a shuffle that we don't directly support.
// vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
if (!Subtarget.hasAVX2())
if (VT == MVT::v32i8 || VT == MVT::v16i16)
return false;
// Just delegate to the generic legality, clear masks aren't special.
return isShuffleMaskLegal(Mask, VT);
}
bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
- // If the subtarget is using retpolines, we need to not generate jump tables.
- if (Subtarget.useRetpolineIndirectBranches())
+ // If the subtarget is using thunks, we need to not generate jump tables.
+ if (Subtarget.useIndirectThunkBranches())
return false;
// Otherwise, fallback on the generic logic.
return TargetLowering::areJTsAllowed(Fn);
}
//===----------------------------------------------------------------------===//
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// For the v = xbegin(), we generate
//
// thisMBB:
// xbegin sinkMBB
//
// mainMBB:
// s0 = -1
//
// fallBB:
// eax = # XABORT_DEF
// s1 = eax
//
// sinkMBB:
// v = phi(s0/mainBB, s1/fallBB)
MachineBasicBlock *thisMBB = MBB;
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, fallMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
MachineRegisterInfo &MRI = MF->getRegInfo();
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
Register mainDstReg = MRI.createVirtualRegister(RC);
Register fallDstReg = MRI.createVirtualRegister(RC);
// thisMBB:
// xbegin fallMBB
// # fallthrough to mainMBB
// # abort jumps to fallMBB
BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(fallMBB);
// mainMBB:
// mainDstReg := -1
BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
mainMBB->addSuccessor(sinkMBB);
// fallMBB:
// ; pseudo instruction to model hardware's definition from XABORT
// EAX := XABORT_DEF
// fallDstReg := EAX
BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
.addReg(X86::EAX);
fallMBB->addSuccessor(sinkMBB);
// sinkMBB:
// DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(fallDstReg).addMBB(fallMBB);
MI.eraseFromParent();
return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
// 0 ) Output : destination address (reg)
// 1-5) Input : va_list address (addr, i64mem)
// 6 ) ArgSize : Size (in bytes) of vararg type
// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
// 8 ) Align : Alignment of type
// 9 ) EFLAGS (implicit-def)
assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
static_assert(X86::AddrNumOperands == 5,
"VAARG_64 assumes 5 address operands");
Register DestReg = MI.getOperand(0).getReg();
MachineOperand &Base = MI.getOperand(1);
MachineOperand &Scale = MI.getOperand(2);
MachineOperand &Index = MI.getOperand(3);
MachineOperand &Disp = MI.getOperand(4);
MachineOperand &Segment = MI.getOperand(5);
unsigned ArgSize = MI.getOperand(6).getImm();
unsigned ArgMode = MI.getOperand(7).getImm();
unsigned Align = MI.getOperand(8).getImm();
MachineFunction *MF = MBB->getParent();
// Memory Reference
assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
MachineMemOperand *OldMMO = MI.memoperands().front();
// Clone the MMO into two separate MMOs for loading and storing
MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
DebugLoc DL = MI.getDebugLoc();
// struct va_list {
// i32 gp_offset
// i32 fp_offset
// i64 overflow_area (address)
// i64 reg_save_area (address)
// }
// sizeof(va_list) = 24
// alignment(va_list) = 8
unsigned TotalNumIntRegs = 6;
unsigned TotalNumXMMRegs = 8;
bool UseGPOffset = (ArgMode == 1);
bool UseFPOffset = (ArgMode == 2);
unsigned MaxOffset = TotalNumIntRegs * 8 +
(UseFPOffset ? TotalNumXMMRegs * 16 : 0);
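// Per the SysV AMD64 ABI the register save area holds 6 GP registers of
// 8 bytes (48 bytes) followed by 8 XMM registers of 16 bytes each, so
// MaxOffset is 48 for gp_offset and 176 for fp_offset.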
/* Align ArgSize to a multiple of 8 */
unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
bool NeedsAlign = (Align > 8);
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *overflowMBB;
MachineBasicBlock *offsetMBB;
MachineBasicBlock *endMBB;
unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
unsigned OffsetReg = 0;
if (!UseGPOffset && !UseFPOffset) {
// If we only pull from the overflow region, we don't create a branch.
// We don't need to alter control flow.
OffsetDestReg = 0; // unused
OverflowDestReg = DestReg;
offsetMBB = nullptr;
overflowMBB = thisMBB;
endMBB = thisMBB;
} else {
// First emit code to check if gp_offset (or fp_offset) is below the bound.
// If so, pull the argument from reg_save_area. (branch to offsetMBB)
// If not, pull from overflow_area. (branch to overflowMBB)
//
// thisMBB
// | .
// | .
// offsetMBB overflowMBB
// | .
// | .
// endMBB
// Registers for the PHI in endMBB
OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator MBBIter = ++MBB->getIterator();
// Insert the new basic blocks
MF->insert(MBBIter, offsetMBB);
MF->insert(MBBIter, overflowMBB);
MF->insert(MBBIter, endMBB);
// Transfer the remainder of MBB and its successor edges to endMBB.
endMBB->splice(endMBB->begin(), thisMBB,
std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
// Make offsetMBB and overflowMBB successors of thisMBB
thisMBB->addSuccessor(offsetMBB);
thisMBB->addSuccessor(overflowMBB);
// endMBB is a successor of both offsetMBB and overflowMBB
offsetMBB->addSuccessor(endMBB);
overflowMBB->addSuccessor(endMBB);
// Load the offset value into a register
OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
.addReg(OffsetReg)
.addImm(MaxOffset + 8 - ArgSizeA8);
// Branch to "overflowMBB" if offset >= max
// Fall through to "offsetMBB" otherwise
BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
.addMBB(overflowMBB).addImm(X86::COND_AE);
}
// In offsetMBB, emit code to use the reg_save_area.
if (offsetMBB) {
assert(OffsetReg != 0);
// Read the reg_save_area address.
Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 16)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// Zero-extend the offset
Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
.addImm(0)
.addReg(OffsetReg)
.addImm(X86::sub_32bit);
// Add the offset to the reg_save_area to get the final address.
BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
.addReg(OffsetReg64)
.addReg(RegSaveReg);
// Compute the offset for the next argument
Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
.addReg(OffsetReg)
.addImm(UseFPOffset ? 16 : 8);
// Store it back into the va_list.
BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.addReg(NextOffsetReg)
.setMemRefs(StoreOnlyMMO);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
.addMBB(endMBB);
}
//
// Emit code to use overflow area
//
// Load the overflow_area address into a register.
Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
.addReg(OverflowAddrReg)
.addImm(Align-1);
BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
.addReg(TmpReg)
.addImm(~(uint64_t)(Align-1));
} else {
BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
.addReg(OverflowAddrReg);
}
// Compute the next overflow address after this argument.
// (the overflow address should be kept 8-byte aligned)
Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
.addReg(OverflowDestReg)
.addImm(ArgSizeA8);
// Store the new overflow address.
BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
.addReg(NextAddrReg)
.setMemRefs(StoreOnlyMMO);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
BuildMI(*endMBB, endMBB->begin(), DL,
TII->get(X86::PHI), DestReg)
.addReg(OffsetDestReg).addMBB(offsetMBB)
.addReg(OverflowDestReg).addMBB(overflowMBB);
}
// Erase the pseudo instruction
MI.eraseFromParent();
return endMBB;
}
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *MBB) const {
// Emit code to save XMM registers to the stack. The ABI says that the
// number of registers to save is given in %al, so it's theoretically
// possible to do an indirect jump trick to avoid saving all of them,
// however this code takes a simpler approach and just executes all
// of the stores if %al is non-zero. It's less code, and it's probably
// easier on the hardware branch predictor, and stores aren't all that
// expensive anyway.
// Create the new basic blocks. One block contains all the XMM stores,
// and one block is the final destination regardless of whether any
// stores were performed.
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
MachineFunction *F = MBB->getParent();
MachineFunction::iterator MBBIter = ++MBB->getIterator();
MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(MBBIter, XMMSaveMBB);
F->insert(MBBIter, EndMBB);
// Transfer the remainder of MBB and its successor edges to EndMBB.
EndMBB->splice(EndMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
// The original block will now fall through to the XMM save block.
MBB->addSuccessor(XMMSaveMBB);
// The XMMSaveMBB will fall through to the end block.
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
Register CountReg = MI.getOperand(0).getReg();
int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
MBB->addSuccessor(EndMBB);
}
// Make sure the last operand is EFLAGS, which gets clobbered by the branch
// that was just emitted, but clearly shouldn't be "saved".
assert((MI.getNumOperands() <= 3 ||
!MI.getOperand(MI.getNumOperands() - 1).isReg() ||
MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
"Expected last argument to be EFLAGS");
unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
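// MOVAPS/VMOVAPS require a 16-byte-aligned address, which the 16-byte
// save slots built below guarantee.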
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
MachineMemOperand *MMO = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
/*Size=*/16, /*Align=*/16);
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
.addFrameIndex(RegSaveFrameIndex)
.addImm(/*Scale=*/1)
.addReg(/*IndexReg=*/0)
.addImm(/*Disp=*/Offset)
.addReg(/*Segment=*/0)
.addReg(MI.getOperand(i).getReg())
.addMemOperand(MMO);
}
MI.eraseFromParent(); // The pseudo instruction is gone now.
return EndMBB;
}
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
MachineBasicBlock* BB,
const TargetRegisterInfo* TRI) {
// Scan forward through BB for a use/def of EFLAGS.
MachineBasicBlock::iterator miI(std::next(SelectItr));
for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
const MachineInstr& mi = *miI;
if (mi.readsRegister(X86::EFLAGS))
return false;
if (mi.definesRegister(X86::EFLAGS))
break; // Should have kill-flag - update below.
}
// If we hit the end of the block, check whether EFLAGS is live into a
// successor.
if (miI == BB->end()) {
for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
sEnd = BB->succ_end();
sItr != sEnd; ++sItr) {
MachineBasicBlock* succ = *sItr;
if (succ->isLiveIn(X86::EFLAGS))
return false;
}
}
// We found a def, or hit the end of the basic block and EFLAGS wasn't live
// out. SelectItr should have a kill flag on EFLAGS.
SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
return true;
}
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic-block with
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
case X86::CMOV_FR64:
case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
case X86::CMOV_VK16:
case X86::CMOV_VK32:
case X86::CMOV_VK64:
return true;
default:
return false;
}
}
// Helper function that inserts PHIs into SinkMBB:
// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
// the last PHI inserted.
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
MachineBasicBlock *SinkMBB) {
MachineFunction *MF = TrueMBB->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
DebugLoc DL = MIItBegin->getDebugLoc();
X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
// As we are creating the PHIs, we have to be careful if there is more than
// one. Later CMOVs may reference the results of earlier CMOVs, but later
// PHIs have to reference the individual true/false inputs from earlier PHIs.
// That also means that PHI construction must work forward from earlier to
// later, and that the code must maintain a mapping from each earlier PHI's
// destination register to the registers that went into that PHI.
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
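  // RegRewriteTable maps a PHI's destination register to the pair of incoming
  // values used to build it: (value from FalseMBB, value from TrueMBB).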
MachineInstrBuilder MIB;
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
Register DestReg = MIIt->getOperand(0).getReg();
Register Op1Reg = MIIt->getOperand(1).getReg();
Register Op2Reg = MIIt->getOperand(2).getReg();
// If the CMOV we are generating uses the opposite condition from
// the jump we generated, then we have to swap the operands for the
// PHI that is going to be generated.
if (MIIt->getOperand(3).getImm() == OppCC)
std::swap(Op1Reg, Op2Reg);
if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
Op1Reg = RegRewriteTable[Op1Reg].first;
if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
Op2Reg = RegRewriteTable[Op2Reg].second;
MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
.addReg(Op1Reg)
.addMBB(FalseMBB)
.addReg(Op2Reg)
.addMBB(TrueMBB);
// Add this PHI to the rewrite table.
RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
}
return MIB;
}
// Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
MachineBasicBlock *
X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
MachineInstr &SecondCascadedCMOV,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = FirstCMOV.getDebugLoc();
// We lower cascaded CMOVs such as
//
// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
//
// to two successive branches.
//
// Without this, we would add a PHI between the two jumps, which ends up
// creating a few copies all around. For instance, for
//
// (sitofp (zext (fcmp une)))
//
// we would generate:
//
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// movaps %xmm0, %xmm1
// jne .LBB5_2
// xorps %xmm1, %xmm1
// .LBB5_2:
// jp .LBB5_4
// movaps %xmm1, %xmm0
// .LBB5_4:
// retq
//
// because this custom-inserter would have generated:
//
// A
// | \
// | B
// | /
// C
// | \
// | D
// | /
// E
//
// A: X = ...; Y = ...
// B: empty
// C: Z = PHI [X, A], [Y, B]
// D: empty
// E: PHI [X, C], [Z, D]
//
// If we lower both CMOVs in a single step, we can instead generate:
//
// A
// | \
// | C
// | /|
// |/ |
// | |
// | D
// | /
// E
//
// A: X = ...; Y = ...
// D: empty
// E: PHI [X, A], [X, C], [Y, D]
//
// Which, in our sitofp/fcmp example, gives us something like:
//
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// jne .LBB5_4
// jp .LBB5_4
// xorps %xmm0, %xmm0
// .LBB5_4:
// retq
//
// We lower cascaded CMOV into two successive branches to the same block.
// EFLAGS is used by both, so mark it as live in the second.
const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
MachineFunction *F = ThisMBB->getParent();
MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++ThisMBB->getIterator();
F->insert(It, FirstInsertedMBB);
F->insert(It, SecondInsertedMBB);
F->insert(It, SinkMBB);
// For a cascaded CMOV, we lower it to two successive branches to
// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
// the FirstInsertedMBB.
FirstInsertedMBB->addLiveIn(X86::EFLAGS);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
SecondInsertedMBB->addLiveIn(X86::EFLAGS);
SinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->begin(), ThisMBB,
std::next(MachineBasicBlock::iterator(FirstCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
// Fallthrough block for ThisMBB.
ThisMBB->addSuccessor(FirstInsertedMBB);
// The true block target of the first branch is always SinkMBB.
ThisMBB->addSuccessor(SinkMBB);
// Fallthrough block for FirstInsertedMBB.
FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
// The true block for the branch of FirstInsertedMBB.
FirstInsertedMBB->addSuccessor(SinkMBB);
// SecondInsertedMBB falls through to SinkMBB.
SecondInsertedMBB->addSuccessor(SinkMBB);
// Create the conditional branch instructions.
X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
X86::CondCode SecondCC =
X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
Register DestReg = FirstCMOV.getOperand(0).getReg();
Register Op1Reg = FirstCMOV.getOperand(1).getReg();
Register Op2Reg = FirstCMOV.getOperand(2).getReg();
MachineInstrBuilder MIB =
BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
.addReg(Op1Reg)
.addMBB(SecondInsertedMBB)
.addReg(Op2Reg)
.addMBB(ThisMBB);
// The edge from FirstInsertedMBB carries the same incoming value as the
// edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
TII->get(TargetOpcode::COPY),
SecondCascadedCMOV.getOperand(0).getReg())
.addReg(FirstCMOV.getOperand(0).getReg());
// Now remove the CMOVs.
FirstCMOV.eraseFromParent();
SecondCascadedCMOV.eraseFromParent();
return SinkMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between and a branch opcode to use.
// ThisMBB:
// ...
// TrueVal = ...
// cmpTY ccX, r1, r2
// bCC copy1MBB
// fallthrough --> FalseMBB
// This code lowers all pseudo-CMOV instructions. Generally it lowers these
// as described above, by inserting a BB, and then making a PHI at the join
// point to select the true and false operands of the CMOV in the PHI.
//
// The code also handles two different cases of multiple CMOV opcodes
// in a row.
//
// Case 1:
// In this case, there are multiple CMOVs in a row, all of which are based on
// the same condition setting (or the exact opposite condition setting).
// In this case we can lower all the CMOVs using a single inserted BB, and
// then make a number of PHIs at the join point to model the CMOVs. The only
// trickiness here, is that in a case like:
//
// t2 = CMOV cond1 t1, f1
// t3 = CMOV cond1 t2, f2
//
// when rewriting this into PHIs, we have to perform some renaming on the
// temps since you cannot have a PHI operand refer to a PHI result earlier
// in the same block. The "simple" but wrong lowering would be:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t2(BB1), f2(BB2)
//
// but clearly t2 is not defined in BB1, so that is incorrect. The proper
// renaming is to note that on the path through BB1, t2 is really just a
// copy of t1, and do that renaming, properly generating:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t1(BB1), f2(BB2)
//
// Case 2:
// CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
// function - EmitLoweredCascadedSelect.
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineInstr *LastCMOV = &MI;
MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
// Check for case 1, where there are multiple CMOVs with the same condition
// first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
// number of jumps the most.
if (isCMOVPseudo(MI)) {
// See if we have a string of CMOVS with the same condition. Skip over
// intervening debug insts.
while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
++NextMIIt;
NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
}
}
// Check for case 2, but only if we didn't already find case 1,
// as indicated by LastCMOV == &MI.
if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
NextMIIt->getOpcode() == MI.getOpcode() &&
NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
NextMIIt->getOperand(1).isKill()) {
return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
}
const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
MachineFunction *F = ThisMBB->getParent();
MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++ThisMBB->getIterator();
F->insert(It, FalseMBB);
F->insert(It, SinkMBB);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (!LastCMOV->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
FalseMBB->addLiveIn(X86::EFLAGS);
SinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer any debug instructions inside the CMOV sequence to the sunk block.
auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
auto DbgIt = MachineBasicBlock::iterator(MI);
while (DbgIt != DbgEnd) {
auto Next = std::next(DbgIt);
if (DbgIt->isDebugInstr())
SinkMBB->push_back(DbgIt->removeFromParent());
DbgIt = Next;
}
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->end(), ThisMBB,
std::next(MachineBasicBlock::iterator(LastCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
// Fallthrough block for ThisMBB.
ThisMBB->addSuccessor(FalseMBB);
// The true block target of the first (or only) branch is always SinkMBB.
ThisMBB->addSuccessor(SinkMBB);
// Fallthrough block for FalseMBB.
FalseMBB->addSuccessor(SinkMBB);
// Create the conditional branch instruction.
BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
// SinkMBB:
// %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
// ...
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator MIItEnd =
std::next(MachineBasicBlock::iterator(LastCMOV));
createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
// Now remove the CMOV(s).
ThisMBB->erase(MIItBegin, MIItEnd);
return SinkMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
const bool Is64Bit = Subtarget.is64Bit();
const bool IsLP64 = Subtarget.isTarget64BitLP64();
const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
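  // TlsReg:TlsOffset is where the split-stack runtime keeps the current
  // stacklet's limit: %fs:0x70 for LP64, %fs:0x40 for 64-bit ILP32 targets,
  // and %gs:0x30 for 32-bit.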
// BB:
// ... [Till the alloca]
// If stacklet is not large enough, jump to mallocMBB
//
// bumpMBB:
// Allocate by subtracting from RSP
// Jump to continueMBB
//
// mallocMBB:
// Allocate by call to runtime
//
// continueMBB:
// ...
// [rest of original BB]
//
MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *AddrRegClass =
getRegClassFor(getPointerTy(MF->getDataLayout()));
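  // Results of the two allocation paths (heap pointer from mallocMBB, bumped
  // stack pointer from bumpMBB), a copy of the incoming SP, and the tentative
  // new SP; sizeVReg carries the requested allocation size from the pseudo.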
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI.getOperand(1).getReg(),
physSPReg =
IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
MachineFunction::iterator MBBIter = ++BB->getIterator();
MF->insert(MBBIter, bumpMBB);
MF->insert(MBBIter, mallocMBB);
MF->insert(MBBIter, continueMBB);
continueMBB->splice(continueMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
continueMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB; otherwise fall through to bumpMBB.
BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// mallocMBB calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
} else if (Is64Bit) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::EDI, RegState::Implicit)
.addReg(X86::EAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
.addImm(12);
BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::EAX, RegState::ImplicitDefine);
}
if (!Is64Bit)
BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
.addImm(16);
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
.addReg(IsLP64 ? X86::RAX : X86::EAX);
BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Set up the CFG correctly.
BB->addSuccessor(bumpMBB);
BB->addSuccessor(mallocMBB);
mallocMBB->addSuccessor(continueMBB);
bumpMBB->addSuccessor(continueMBB);
// Take care of the PHI nodes.
BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
MI.getOperand(0).getReg())
.addReg(mallocPtrVReg)
.addMBB(mallocMBB)
.addReg(bumpSPPtrVReg)
.addMBB(bumpMBB);
// Delete the original pseudo instruction.
MI.eraseFromParent();
// And we're done.
return continueMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
DebugLoc DL = MI.getDebugLoc();
assert(!isAsynchronousEHPersonality(
classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
// Only 32-bit EH needs to worry about manually restoring stack pointers.
if (!Subtarget.is32Bit())
return BB;
// C++ EH creates a new target block to hold the restore code, and wires up
// the new block to the return destination with a normal JMP_4.
MachineBasicBlock *RestoreMBB =
MF->CreateMachineBasicBlock(BB->getBasicBlock());
assert(BB->succ_size() == 1);
MF->insert(std::next(BB->getIterator()), RestoreMBB);
RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(RestoreMBB);
MI.getOperand(0).setMBB(RestoreMBB);
auto RestoreMBBI = RestoreMBB->begin();
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const Constant *PerFn = MF->getFunction().getPersonalityFn();
bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
// Only 32-bit SEH requires special handling for catchpad.
if (IsSEH && Subtarget.is32Bit()) {
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
}
MI.eraseFromParent();
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Here we replace TLSADDR with the sequence:
// adjust_stackdown -> TLSADDR -> adjust_stackup.
// We need this because TLSADDR is lowered into calls inside MC;
// without the two markers, shrink-wrapping may push the
// prologue/epilogue past them.
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
MachineFunction &MF = *BB->getParent();
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
// We don't call eraseFromParent because we want to keep the
// original instruction around.
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
MachineInstrBuilder CallseqEnd =
BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This is pretty easy. We're taking the value that we received from
// our load from the relocation, sticking it in either RDI (x86-64)
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
assert(MI.getOperand(3).isGlobal() && "This should be a global");
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
Subtarget.is64Bit() ?
Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
if (Subtarget.is64Bit()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
addDirectMem(MIB, X86::RDI);
MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else if (!isPositionIndependent()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
.addReg(0)
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
.addReg(TII->getGlobalBaseReg(F))
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
}
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
-static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
+static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
switch (RPOpc) {
- case X86::RETPOLINE_CALL32:
+ case X86::INDIRECT_THUNK_CALL32:
return X86::CALLpcrel32;
- case X86::RETPOLINE_CALL64:
+ case X86::INDIRECT_THUNK_CALL64:
return X86::CALL64pcrel32;
- case X86::RETPOLINE_TCRETURN32:
+ case X86::INDIRECT_THUNK_TCRETURN32:
return X86::TCRETURNdi;
- case X86::RETPOLINE_TCRETURN64:
+ case X86::INDIRECT_THUNK_TCRETURN64:
return X86::TCRETURNdi64;
}
- llvm_unreachable("not retpoline opcode");
+ llvm_unreachable("not indirect thunk opcode");
}
-static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
- unsigned Reg) {
+static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
+ unsigned Reg) {
if (Subtarget.useRetpolineExternalThunk()) {
// When using an external thunk for retpolines, we pick names that match the
// names GCC happens to use as well. This helps simplify the implementation
// of the thunks for kernels where they have no easy ability to create
// aliases and are doing non-trivial configuration of the thunk's body. For
// example, the Linux kernel will do boot-time hot patching of the thunk
// bodies and cannot easily export aliases of these to loaded modules.
//
// Note that at any point in the future, we may need to change the semantics
// of how we implement retpolines and at that time will likely change the
// name of the called thunk. Essentially, there is no hard guarantee that
// LLVM will generate calls to specific thunks, we merely make a best-effort
// attempt to help out kernels and other systems where duplicating the
// thunks is costly.
switch (Reg) {
case X86::EAX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_eax";
case X86::ECX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_ecx";
case X86::EDX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_edx";
case X86::EDI:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_edi";
case X86::R11:
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__x86_indirect_thunk_r11";
}
+ llvm_unreachable("unexpected reg for external indirect thunk");
+ }
+
+ if (Subtarget.useRetpolineIndirectCalls() ||
+ Subtarget.useRetpolineIndirectBranches()) {
+ // When targeting an internal COMDAT thunk use an LLVM-specific name.
+ switch (Reg) {
+ case X86::EAX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_eax";
+ case X86::ECX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_ecx";
+ case X86::EDX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edi";
+ case X86::R11:
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__llvm_retpoline_r11";
+ }
llvm_unreachable("unexpected reg for retpoline");
}
- // When targeting an internal COMDAT thunk use an LLVM-specific name.
- switch (Reg) {
- case X86::EAX:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_eax";
- case X86::ECX:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_ecx";
- case X86::EDX:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_edx";
- case X86::EDI:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_edi";
- case X86::R11:
+ if (Subtarget.useLVIControlFlowIntegrity()) {
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
- return "__llvm_retpoline_r11";
+ return "__llvm_lvi_thunk_r11";
}
- llvm_unreachable("unexpected reg for retpoline");
+ llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
- MachineBasicBlock *BB) const {
+X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
// Copy the virtual register into a scratch physical register and
// call the indirect thunk.
DebugLoc DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
Register CalleeVReg = MI.getOperand(0).getReg();
- unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
+ unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
// Find an available scratch register to hold the callee. On 64-bit, we can
// just use R11, but we scan for uses anyway to ensure we don't generate
// incorrect code. On 32-bit, we use whichever of EAX, ECX, or EDX isn't
// already a register use operand of the call to hold the callee. If none
// are available, use EDI instead. EDI is chosen because EBX is the PIC base
// register and ESI is the base pointer to realigned stack frames with VLAs.
SmallVector<unsigned, 3> AvailableRegs;
if (Subtarget.is64Bit())
AvailableRegs.push_back(X86::R11);
else
AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
// Zero out any registers that are already used.
for (const auto &MO : MI.operands()) {
if (MO.isReg() && MO.isUse())
for (unsigned &Reg : AvailableRegs)
if (Reg == MO.getReg())
Reg = 0;
}
// Choose the first remaining non-zero available register.
unsigned AvailableReg = 0;
for (unsigned MaybeReg : AvailableRegs) {
if (MaybeReg) {
AvailableReg = MaybeReg;
break;
}
}
if (!AvailableReg)
report_fatal_error("calling convention incompatible with retpoline, no "
"available registers");
- const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
+ const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
.addReg(CalleeVReg);
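  // Rewrite the pseudo in place: retarget it at the thunk symbol, swap in the
  // real call/tail-call opcode, and add the scratch register as an implicit
  // killed use so it stays live up to the call.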
MI.getOperand(0).ChangeToES(Symbol);
MI.setDesc(TII->get(Opc));
MachineInstrBuilder(*BB->getParent(), &MI)
.addReg(AvailableReg, RegState::Implicit | RegState::Kill);
return BB;
}
/// SetJmp implies future control flow change upon calling the corresponding
/// LongJmp.
/// Instead of using the 'return' instruction, the long jump fixes the stack and
/// performs an indirect branch. To do so it uses the registers that were stored
/// in the jump buffer (when calling SetJmp).
/// If the shadow stack is enabled, we need to fix it as well, because some
/// return addresses will be skipped.
/// This function saves the SSP so that emitLongJmpShadowStackFix can fix the
/// shadow stack later.
/// \sa emitLongJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineInstrBuilder MIB;
// Memory Reference.
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
// Initialize a register with zero.
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
Register ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
.addDef(ZReg)
.addReg(ZReg, RegState::Undef)
.addReg(ZReg, RegState::Undef);
// Read the current SSP register value into the zeroed register.
Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
// Write the SSP register value to slot 3 of the input memory buffer.
unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
const int64_t SSPOffset = 3 * PVT.getStoreSize();
const unsigned MemOpndSlot = 1;
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
else
MIB.add(MI.getOperand(MemOpndSlot + i));
}
MIB.addReg(SSPCopyReg);
MIB.setMemRefs(MMOs);
}
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
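  // Operand layout of the setjmp pseudo: the destination register comes
  // first, followed by the five X86 memory operands addressing the jump
  // buffer.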
unsigned DstReg;
unsigned MemOpndSlot = 0;
unsigned CurOp = 0;
DstReg = MI.getOperand(CurOp++).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
(void)TRI;
Register mainDstReg = MRI.createVirtualRegister(RC);
Register restoreDstReg = MRI.createVirtualRegister(RC);
MemOpndSlot = CurOp;
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
// buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
// SjLjSetup restoreMBB
//
// mainMBB:
// v_main = 0
//
// sinkMBB:
// v = phi(main, restore)
//
// restoreMBB:
// if base pointer being used, load it from frame
// v_restore = 1
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MF->push_back(restoreMBB);
restoreMBB->setHasAddressTaken();
MachineInstrBuilder MIB;
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
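  // Jump-buffer layout used by this expansion: slot 0 holds FP, slot 1 the
  // restore IP, and slot 2 SP (all reloaded in emitEHSjLjLongJmp); slot 3
  // holds the SSP when shadow stacks are in use.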
unsigned PtrStoreOpc = 0;
unsigned LabelReg = 0;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
!isPositionIndependent();
// Prepare IP either in reg or imm.
if (!UseImmLabel) {
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
LabelReg = MRI.createVirtualRegister(PtrRC);
if (Subtarget.is64Bit()) {
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addMBB(restoreMBB)
.addReg(0);
} else {
const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
.addReg(XII->getGlobalBaseReg(MF))
.addImm(0)
.addReg(0)
.addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
} else
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Store IP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
MIB.add(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
else
MIB.addMBB(restoreMBB);
MIB.setMemRefs(MMOs);
if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
emitSetJmpShadowStackFix(MI, thisMBB);
}
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
// mainMBB:
// v_main = 0
BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
mainMBB->addSuccessor(sinkMBB);
// sinkMBB:
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(X86::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(restoreDstReg).addMBB(restoreMBB);
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
const bool Uses64BitFramePtr =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
Register FramePtr = RegInfo->getFrameRegister(*MF);
Register BasePtr = RegInfo->getBaseRegister();
unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
.setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
MI.eraseFromParent();
return sinkMBB;
}
/// Fix the shadow stack using the previously saved SSP pointer.
/// \sa emitSetJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
/// \return The sink MBB that will perform the future indirect branch.
MachineBasicBlock *
X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
// checkSspMBB:
// xor vreg1, vreg1
// rdssp vreg1
// test vreg1, vreg1
// je sinkMBB # Jump if Shadow Stack is not supported
// fallMBB:
// mov buf+24/12(%rip), vreg2
// sub vreg1, vreg2
// jbe sinkMBB # No need to fix the Shadow Stack
// fixShadowMBB:
// shr 3/2, vreg2
// incssp vreg2 # fix the SSP according to the lower 8 bits
// shr 8, vreg2
// je sinkMBB
// fixShadowLoopPrepareMBB:
// shl vreg2
// mov 128, vreg3
// fixShadowLoopMBB:
// incssp vreg3
// dec vreg2
// jne fixShadowLoopMBB # Iterate until you finish fixing
// # the Shadow Stack
// sinkMBB:
MachineFunction::iterator I = ++MBB->getIterator();
const BasicBlock *BB = MBB->getBasicBlock();
MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, checkSspMBB);
MF->insert(I, fallMBB);
MF->insert(I, fixShadowMBB);
MF->insert(I, fixShadowLoopPrepareMBB);
MF->insert(I, fixShadowLoopMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
MBB->addSuccessor(checkSspMBB);
// Initialize a register with zero.
Register ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
.addDef(ZReg)
.addReg(ZReg, RegState::Undef)
.addReg(ZReg, RegState::Undef);
// Read the current SSP register value into the zeroed register.
Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
// Check whether the SSP value read back is zero (shadow stacks are not
// supported) and jump directly to the sink.
unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
.addReg(SSPCopyReg)
.addReg(SSPCopyReg);
BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
checkSspMBB->addSuccessor(sinkMBB);
checkSspMBB->addSuccessor(fallMBB);
// Reload the previously saved SSP register value.
Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
const int64_t SSPOffset = 3 * PVT.getStoreSize();
MachineInstrBuilder MIB =
BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (i == X86::AddrDisp)
MIB.addDisp(MO, SSPOffset);
else if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
MIB.addReg(MO.getReg());
else
MIB.add(MO);
}
MIB.setMemRefs(MMOs);
// Subtract the current SSP from the previous SSP.
Register SspSubReg = MRI.createVirtualRegister(PtrRC);
unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
.addReg(PrevSSPReg)
.addReg(SSPCopyReg);
// Jump to sink in case PrevSSPReg <= SSPCopyReg.
BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
fallMBB->addSuccessor(sinkMBB);
fallMBB->addSuccessor(fixShadowMBB);
// Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
.addReg(SspSubReg)
.addImm(Offset);
// Advance the SSP by the count in the lower 8 bits of the delta.
unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
// Reset the lower 8 bits.
Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
.addReg(SspFirstShrReg)
.addImm(8);
// Jump if the result of the shift is zero.
BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
fixShadowMBB->addSuccessor(sinkMBB);
fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
// Do a single shift left: the remaining delta counts 256-entry chunks, and
// each loop iteration below advances the SSP by 128 entries, so doubling it
// yields the iteration count.
unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
.addReg(SspSecondShrReg);
// Save the value 128 to a register (will be used next with incssp).
Register Value128InReg = MRI.createVirtualRegister(PtrRC);
unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
.addImm(128);
fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
// Since incssp only looks at the lower 8 bits, we might need to do several
// iterations of incssp until we finish fixing the shadow stack.
Register DecReg = MRI.createVirtualRegister(PtrRC);
Register CounterReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
.addReg(SspAfterShlReg)
.addMBB(fixShadowLoopPrepareMBB)
.addReg(DecReg)
.addMBB(fixShadowLoopMBB);
// Every iteration we increase the SSP by 128.
BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
// Every iteration we decrement the counter by 1.
unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
// Jump if the counter is not zero yet.
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
fixShadowLoopMBB->addSuccessor(sinkMBB);
fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
Register SP = RegInfo->getStackRegister();
MachineInstrBuilder MIB;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t SPOffset = 2 * PVT.getStoreSize();
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
MachineBasicBlock *thisMBB = MBB;
// When CET and shadow stacks are enabled, we need to fix the shadow stack.
if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
}
// Reload FP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
MIB.addReg(MO.getReg());
else
MIB.add(MO);
}
MIB.setMemRefs(MMOs);
// Reload IP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (i == X86::AddrDisp)
MIB.addDisp(MO, LabelOffset);
else if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
MIB.addReg(MO.getReg());
else
MIB.add(MO);
}
MIB.setMemRefs(MMOs);
// Reload SP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
else
MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
// the last instruction of the expansion.
}
MIB.setMemRefs(MMOs);
// Jump
BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
MI.eraseFromParent();
return thisMBB;
}
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB,
int FI) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
unsigned Op = 0;
unsigned VR = 0;
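  // Op is the store opcode used to write the dispatch address into the
  // function context; VR holds that address when it must first be
  // materialized with a LEA.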
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
!isPositionIndependent();
if (UseImmLabel) {
Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
} else {
const TargetRegisterClass *TRC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
VR = MRI->createVirtualRegister(TRC);
Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
if (Subtarget.is64Bit())
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
.addReg(X86::RIP)
.addImm(1)
.addReg(0)
.addMBB(DispatchBB)
.addReg(0);
else
BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
.addReg(0) /* TII->getGlobalBaseReg(MF) */
.addImm(1)
.addReg(0)
.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
if (UseImmLabel)
MIB.addMBB(DispatchBB);
else
MIB.addReg(VR);
}
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
int FI = MF->getFrameInfo().getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
unsigned MaxCSNum = 0;
for (auto &MBB : *MF) {
if (!MBB.isEHPad())
continue;
MCSymbol *Sym = nullptr;
for (const auto &MI : MBB) {
if (MI.isDebugInstr())
continue;
assert(MI.isEHLabel() && "expected EH_LABEL");
Sym = MI.getOperand(0).getMCSymbol();
break;
}
if (!MF->hasCallSiteLandingPad(Sym))
continue;
for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
CallSiteNumToLPad[CSI].push_back(&MBB);
MaxCSNum = std::max(MaxCSNum, CSI);
}
}
// Get an ordered list of the machine basic blocks for the jump table.
std::vector<MachineBasicBlock *> LPadList;
SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
LPadList.reserve(CallSiteNumToLPad.size());
for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
for (auto &LP : CallSiteNumToLPad[CSI]) {
LPadList.push_back(LP);
InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
}
}
assert(!LPadList.empty() &&
"No landing pad destinations for the dispatch jump table!");
// Create the MBBs for the dispatch code.
// Shove the dispatch's address into the return slot in the function context.
MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
DispatchBB->setIsEHPad(true);
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
BuildMI(TrapBB, DL, TII->get(X86::TRAP));
DispatchBB->addSuccessor(TrapBB);
MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
DispatchBB->addSuccessor(DispContBB);
// Insert MBBs.
MF->push_back(DispatchBB);
MF->push_back(DispContBB);
MF->push_back(TrapBB);
// Insert code into the entry block that creates and registers the function
// context.
SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
// Create the jump table and associated information
unsigned JTE = getJumpTableEncoding();
MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
const X86RegisterInfo &RI = TII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
if (RI.hasBasePointer(*MF)) {
const bool FPIs64Bit =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setRestoreBasePointer(MF);
Register FP = RI.getFrameRegister(*MF);
Register BP = RI.getBaseRegister();
unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
MFI->getRestoreBasePointerOffset())
.addRegMask(RI.getNoPreservedMask());
} else {
BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
.addRegMask(RI.getNoPreservedMask());
}
// IReg is used as an index in a memory operand and therefore can't be SP
Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
Subtarget.is64Bit() ? 8 : 4);
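  // Bounds-check the call-site index against the landing-pad list; anything
  // out of range jumps to the trap block.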
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
.addReg(IReg)
.addImm(LPadList.size());
BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
if (Subtarget.is64Bit()) {
Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
// leaq .LJTI0_0(%rip), BReg
BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
.addReg(X86::RIP)
.addImm(1)
.addReg(0)
.addJumpTableIndex(MJTI)
.addReg(0);
// movzx IReg64, IReg
BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
.addImm(0)
.addReg(IReg)
.addImm(X86::sub_32bit);
switch (JTE) {
case MachineJumpTableInfo::EK_BlockAddress:
// jmpq *(BReg,IReg64,8)
BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
.addReg(BReg)
.addImm(8)
.addReg(IReg64)
.addImm(0)
.addReg(0);
break;
case MachineJumpTableInfo::EK_LabelDifference32: {
Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
// movl (BReg,IReg64,4), OReg
BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
.addReg(BReg)
.addImm(4)
.addReg(IReg64)
.addImm(0)
.addReg(0);
// movsx OReg64, OReg
BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
// addq BReg, OReg64, TReg
BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
.addReg(OReg64)
.addReg(BReg);
// jmpq *TReg
BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
break;
}
default:
llvm_unreachable("Unexpected jump table encoding");
}
} else {
// jmpl *.LJTI0_0(,IReg,4)
BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
.addReg(0)
.addImm(4)
.addReg(IReg)
.addJumpTableIndex(MJTI)
.addReg(0);
}
// Add the jump table entries as successors to the MBB.
SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
for (auto &LP : LPadList)
if (SeenMBBs.insert(LP).second)
DispContBB->addSuccessor(LP);
// N.B. the order the invoke BBs are processed in doesn't matter here.
SmallVector<MachineBasicBlock *, 64> MBBLPads;
const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
for (MachineBasicBlock *MBB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
// Keep a copy of Successors since it's modified inside the loop.
SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
MBB->succ_rend());
// FIXME: Avoid quadratic complexity.
for (auto MBBS : Successors) {
if (MBBS->isEHPad()) {
MBB->removeSuccessor(MBBS);
MBBLPads.push_back(MBBS);
}
}
MBB->addSuccessor(DispatchBB);
// Find the invoke call and mark all of the callee-saved registers as
// 'implicitly defined' so that they're spilled. This prevents later passes
// from moving instructions above the EH block, where they would never be
// executed.
for (auto &II : reverse(*MBB)) {
if (!II.isCall())
continue;
DenseMap<unsigned, bool> DefRegs;
for (auto &MOp : II.operands())
if (MOp.isReg())
DefRegs[MOp.getReg()] = true;
MachineInstrBuilder MIB(*MF, &II);
for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
unsigned Reg = SavedRegs[RegIdx];
if (!DefRegs[Reg])
MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
}
break;
}
}
// Mark all former landing pads as non-landing pads. The dispatch is the only
// landing pad now.
for (auto &LP : MBBLPads)
LP->setIsEHPad(false);
// The instruction is gone now.
MI.eraseFromParent();
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
- case X86::RETPOLINE_CALL32:
- case X86::RETPOLINE_CALL64:
- case X86::RETPOLINE_TCRETURN32:
- case X86::RETPOLINE_TCRETURN64:
- return EmitLoweredRetpoline(MI, BB);
+ case X86::INDIRECT_THUNK_CALL32:
+ case X86::INDIRECT_THUNK_CALL64:
+ case X86::INDIRECT_THUNK_TCRETURN32:
+ case X86::INDIRECT_THUNK_TCRETURN64:
+ return EmitLoweredIndirectThunk(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case X86::CATCHPAD:
return EmitLoweredCatchPad(MI, BB);
case X86::SEG_ALLOCA_32:
case X86::SEG_ALLOCA_64:
return EmitLoweredSegAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
case X86::CMOV_FR64:
case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
case X86::CMOV_VK16:
case X86::CMOV_VK32:
case X86::CMOV_VK64:
return EmitLoweredSelect(MI, BB);
case X86::RDFLAGS32:
case X86::RDFLAGS64: {
unsigned PushF =
MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
// Permit reads of the EFLAGS and DF registers without them being defined.
// This intrinsic exists to read external processor state in flags, such as
// the trap flag, interrupt flag, and direction flag, none of which are
// modeled by the backend.
assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
"Unexpected register in operand!");
Push->getOperand(2).setIsUndef();
assert(Push->getOperand(3).getReg() == X86::DF &&
"Unexpected register in operand!");
Push->getOperand(3).setIsUndef();
BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
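// Net effect: the RDFLAGS pseudo expands to roughly 'pushf; pop <dst>'
// (PUSHF64/POP64r on x86-64), materializing EFLAGS in a GPR.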
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::WRFLAGS32:
case X86::WRFLAGS64: {
unsigned Push =
MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
BuildMI(*BB, MI, DL, TII->get(PopF));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
case X86::FP64_TO_INT16_IN_MEM:
case X86::FP64_TO_INT32_IN_MEM:
case X86::FP64_TO_INT64_IN_MEM:
case X86::FP80_TO_INT16_IN_MEM:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
// Load the old value of the control word...
Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
OrigCWFrameIdx);
// OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
.addReg(OldCW, RegState::Kill).addImm(0xC00);
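// For example, the x87 power-on default control word 0x037F becomes
// 0x0F7F here: rounding control (bits 11:10) switches to round-toward-zero
// while the precision control and exception masks are left untouched.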
// Extract to 16 bits.
Register NewCW16 =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
// Prepare memory for FLDCW.
int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
NewCWFrameIdx)
.addReg(NewCW16, RegState::Kill);
// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), NewCWFrameIdx);
// Get the X86 opcode to use.
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
}
X86AddressMode AM = getAddressFromInstr(&MI, 0);
addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
.addReg(MI.getOperand(X86::AddrNumOperands).getReg());
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
// xbegin
case X86::XBEGIN:
return emitXBegin(MI, BB, Subtarget.getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
case X86::VAARG_64:
return EmitVAARG64WithCustomInserter(MI, BB);
case X86::EH_SjLj_SetJmp32:
case X86::EH_SjLj_SetJmp64:
return emitEHSjLjSetJmp(MI, BB);
case X86::EH_SjLj_LongJmp32:
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
case X86::Int_eh_sjlj_setup_dispatch:
return EmitSjLjDispatchBlock(MI, BB);
case TargetOpcode::STATEPOINT:
// As an implementation detail, STATEPOINT shares the STACKMAP format at
// this point in the process. We diverge later.
return emitPatchPoint(MI, BB);
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
case TargetOpcode::PATCHABLE_EVENT_CALL:
return emitXRayCustomEvent(MI, BB);
case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
return emitXRayTypedEvent(MI, BB);
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
// requires a memory operand. If the current architecture is i686 and the
// current function needs a base pointer
// - which is ESI for i686 - the register allocator would not be able to
// allocate registers for an address of the form X(%reg, %reg, Y):
// there would never be enough unreserved registers during regalloc
// (without the need for a base pointer, the only option would be
// X(%edi, %esi, Y)).
// We give the register allocator a hand by precomputing the address in
// a new vreg using LEA.
// If it is not i686 or there is no base pointer, there is nothing to do here.
if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
return BB;
// Even though this code does not necessarily need the base pointer to
// be ESI, we check for that. The reason: if this assert fails, something
// has changed in the compiler's base pointer handling, and that change
// most probably has to be addressed somehow here as well.
assert(TRI->getBaseRegister() == X86::ESI &&
"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind");
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT SPTy = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
X86AddressMode AM = getAddressFromInstr(&MI, 0);
// Regalloc does not need any help when the memory operand of CMPXCHG8B
// does not use an index register.
if (AM.IndexReg == X86::NoRegister)
return BB;
// After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
// four operand definitions that are E[ABCD] registers. We skip them and
// then insert the LEA.
MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
RMBBI->definesRegister(X86::EBX) ||
RMBBI->definesRegister(X86::ECX) ||
RMBBI->definesRegister(X86::EDX))) {
++RMBBI;
}
MachineBasicBlock::iterator MBBI(RMBBI);
addFullAddress(
BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
setDirectAddressInInstr(&MI, 0, computedAddrVReg);
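// The emitted sequence is then roughly:
//   leal X(%reg, %reg, Y), %vreg  ; address precomputed once
//   lock cmpxchg8b (%vreg)        ; memory operand now needs just one register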
return BB;
}
case X86::LCMPXCHG16B:
return BB;
case X86::LCMPXCHG8B_SAVE_EBX:
case X86::LCMPXCHG16B_SAVE_RBX: {
unsigned BasePtr =
MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
if (!BB->isLiveIn(BasePtr))
BB->addLiveIn(BasePtr);
return BB;
}
}
}
//===----------------------------------------------------------------------===//
// X86 Optimization Hooks
//===----------------------------------------------------------------------===//
bool
X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
const APInt &Demanded,
TargetLoweringOpt &TLO) const {
// Only optimize Ands to prevent shrinking a constant that could be
// matched by movzx.
if (Op.getOpcode() != ISD::AND)
return false;
EVT VT = Op.getValueType();
// Ignore vectors.
if (VT.isVector())
return false;
unsigned Size = VT.getSizeInBits();
// Make sure the RHS really is a constant.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!C)
return false;
const APInt &Mask = C->getAPIntValue();
// Clear all non-demanded bits initially.
APInt ShrunkMask = Mask & Demanded;
// Find the width of the shrunk mask.
unsigned Width = ShrunkMask.getActiveBits();
// If the mask is all 0s there's nothing to do here.
if (Width == 0)
return false;
// Find the next power of 2 width, rounding up to a byte.
Width = PowerOf2Ceil(std::max(Width, 8U));
// Truncate the width to size to handle illegal types.
Width = std::min(Width, Size);
// Calculate a possible zero extend mask for this constant.
APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
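// Worked example: Size = 32, Mask = 0x1FF, Demanded = 0xFF gives
// ShrunkMask = 0xFF, Width = 8 and ZeroExtendMask = 0xFF; since 0xFF is a
// subset of Mask | ~Demanded, the AND constant shrinks to 0xFF, which a
// later movzbl can match.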
// If we aren't changing the mask, just return true to keep it and prevent
// the caller from optimizing.
if (ZeroExtendMask == Mask)
return true;
// Make sure the new mask can be represented by a combination of mask bits
// and non-demanded bits.
if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
return false;
// Replace the constant with the zero extend mask.
SDLoc DL(Op);
SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
return TLO.CombineTo(Op, NewOp);
}
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
Opc == ISD::INTRINSIC_WO_CHAIN ||
Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) &&
"Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!");
Known.resetAll();
switch (Opc) {
default: break;
case X86ISD::SETCC:
Known.Zero.setBitsFrom(1);
break;
case X86ISD::MOVMSK: {
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
Known.Zero.setBitsFrom(NumLoBits);
break;
}
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Op.getConstantOperandVal(1));
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
Known = Known.zextOrTrunc(BitWidth, false);
Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
break;
}
case X86ISD::VSRAI:
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
unsigned ShAmt = Op.getConstantOperandVal(1);
if (ShAmt >= VT.getScalarSizeInBits()) {
Known.setAllZero();
break;
}
Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Opc == X86ISD::VSHLI) {
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits are known zero.
Known.Zero.setLowBits(ShAmt);
} else if (Opc == X86ISD::VSRLI) {
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// High bits are known zero.
Known.Zero.setHighBits(ShAmt);
} else {
Known.Zero.ashrInPlace(ShAmt);
Known.One.ashrInPlace(ShAmt);
}
break;
}
case X86ISD::PACKUS: {
// PACKUS is just a truncation if the upper half is zero.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
Known.One = APInt::getAllOnesValue(BitWidth * 2);
Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
KnownBits Known2;
if (!!DemandedLHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (!!DemandedRHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
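// If the upper BitWidth bits of the double-width inputs are not all known
// zero, the pack saturates rather than truncates, so nothing is known.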
if (Known.countMinLeadingZeros() < BitWidth)
Known.resetAll();
Known = Known.trunc(BitWidth);
break;
}
case X86ISD::ANDNP: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// ANDNP = (~X & Y);
Known.One &= Known2.Zero;
Known.Zero |= Known2.One;
break;
}
case X86ISD::FOR: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// Output known-0 bits are only known if clear in both the LHS & RHS.
Known.Zero &= Known2.Zero;
// Output known-1 are known to be set if set in either the LHS | RHS.
Known.One |= Known2.One;
break;
}
case X86ISD::PSADBW: {
assert(VT.getScalarType() == MVT::i64 &&
Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
"Unexpected PSADBW types");
// PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
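// (The maximum sum of eight absolute byte differences is 8 * 255 = 2040,
// which comfortably fits in 16 bits.)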
Known.Zero.setBitsFrom(16);
break;
}
case X86ISD::CMOV: {
Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
}
}
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opc)) {
bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
IsUnary)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
int M = Mask[i];
if (M == SM_SentinelUndef) {
// For UNDEF elements, we don't know anything about the common state
// of the shuffle result.
Known.resetAll();
break;
} else if (M == SM_SentinelZero) {
Known.One.clearAllBits();
continue;
}
assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range");
unsigned OpIdx = (unsigned)M / NumElts;
unsigned EltIdx = (unsigned)M % NumElts;
if (Ops[OpIdx].getValueType() != VT) {
// TODO - handle target shuffle ops with different value types.
Known.resetAll();
break;
}
DemandedOps[OpIdx].setBit(EltIdx);
}
// Known bits are the values that are shared by every demanded element.
for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
if (!DemandedOps[i])
continue;
KnownBits Known2 =
DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
}
}
}
}
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
EVT VT = Op.getValueType();
unsigned VTBits = VT.getScalarSizeInBits();
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case X86ISD::SETCC_CARRY:
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
return VTBits;
case X86ISD::VTRUNC: {
// TODO: Add DemandedElts support.
SDValue Src = Op.getOperand(0);
unsigned NumSrcBits = Src.getScalarValueSizeInBits();
assert(VTBits < NumSrcBits && "Illegal truncation input type");
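// For example, truncating an i32 source with 20 known sign bits to i16
// leaves 20 - (32 - 16) = 4 sign bits in the result.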
unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
if (Tmp > (NumSrcBits - VTBits))
return Tmp - (NumSrcBits - VTBits);
return 1;
}
case X86ISD::PACKSS: {
// PACKSS is just a truncation if the sign bits extend to the packed size.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
DemandedRHS);
unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
if (!!DemandedLHS)
Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
if (!!DemandedRHS)
Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
unsigned Tmp = std::min(Tmp0, Tmp1);
if (Tmp > (SrcBits - VTBits))
return Tmp - (SrcBits - VTBits);
return 1;
}
case X86ISD::VSHLI: {
SDValue Src = Op.getOperand(0);
const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits))
return VTBits; // Shifted all bits out --> zero.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
if (ShiftVal.uge(Tmp))
return 1; // Shifted all sign bits out --> unknown.
return Tmp - ShiftVal.getZExtValue();
}
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
APInt ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits - 1))
return VTBits; // Sign splat.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
ShiftVal += Tmp;
return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
}
case X86ISD::PCMPGT:
case X86ISD::PCMPEQ:
case X86ISD::CMPP:
case X86ISD::VPCOM:
case X86ISD::VPCOMU:
// Vector compares return zero/all-bits result values.
return VTBits;
case X86ISD::ANDNP: {
unsigned Tmp0 =
DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Tmp0 == 1) return 1; // Early out.
unsigned Tmp1 =
DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
return std::min(Tmp0, Tmp1);
}
case X86ISD::CMOV: {
unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp0 == 1) return 1; // Early out.
unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
return std::min(Tmp0, Tmp1);
}
}
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opcode)) {
bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
IsUnary)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
int M = Mask[i];
if (M == SM_SentinelUndef) {
// For UNDEF elements, we don't know anything about the common state
// of the shuffle result.
return 1;
} else if (M == SM_SentinelZero) {
// Zero = all sign bits.
continue;
}
assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range");
unsigned OpIdx = (unsigned)M / NumElts;
unsigned EltIdx = (unsigned)M % NumElts;
if (Ops[OpIdx].getValueType() != VT) {
// TODO - handle target shuffle ops with different value types.
return 1;
}
DemandedOps[OpIdx].setBit(EltIdx);
}
unsigned Tmp0 = VTBits;
for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
if (!DemandedOps[i])
continue;
unsigned Tmp1 =
DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
Tmp0 = std::min(Tmp0, Tmp1);
}
return Tmp0;
}
}
}
// Fallback case.
return 1;
}
SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
return N->getOperand(0);
return N;
}
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget, unsigned &Shuffle,
MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
// Match against a VZEXT_MOVL vXi32 zero-extending instruction.
if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
Shuffle = X86ISD::VZEXT_MOVL;
SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
return true;
}
// Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
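// For example, on v4i32 the mask {0, Z, 1, Z} (Z = undef-or-zero) matches
// a v4i32 -> v2i64 ZERO_EXTEND_VECTOR_INREG with Scale = 2.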
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
bool MatchAny = true;
bool MatchZero = true;
unsigned NumDstElts = NumMaskElts / Scale;
for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
MatchAny = MatchZero = false;
break;
}
MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
if (MatchAny || MatchZero) {
assert(MatchZero && "Failed to match zext but matched aext?");
unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
MVT::getIntegerVT(MaskEltSize);
SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
if (SrcVT.getVectorNumElements() != NumDstElts)
Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
return true;
}
}
}
// Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit (MOVSS).
if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
isUndefOrEqual(Mask[0], 0) &&
isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
Shuffle = X86ISD::VZEXT_MOVL;
SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
return true;
}
// Check if we have SSE3, which will let us use MOVDDUP etc. These
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
}
if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v4f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
}
if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v8f64;
return true;
}
if (isTargetShuffleEquivalent(
Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
if (isTargetShuffleEquivalent(
Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
}
return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
const APInt &Zeroable,
bool AllowFloatDomain, bool AllowIntDomain,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
bool ContainsZeros =
llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
// Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
if (!ContainsZeros && MaskScalarSizeInBits == 64) {
// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
}
} else if (AllowFloatDomain && Subtarget.hasAVX()) {
// VPERMILPD can permute with a non-repeating shuffle.
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
PermuteImm |= (M & 1) << i;
}
return true;
}
}
// Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
// AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
!ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
// Narrow the repeated mask to create 32-bit element permutes.
SmallVector<int, 4> WordMask = RepeatedMask;
if (MaskScalarSizeInBits == 64)
scaleShuffleMask<int>(2, RepeatedMask, WordMask);
Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
PermuteImm = getV4X86ShuffleImm(WordMask);
return true;
}
}
// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
// PSHUFLW: permute lower 4 elements only.
if (isUndefOrInRange(LoMask, 0, 4) &&
isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
Shuffle = X86ISD::PSHUFLW;
ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(LoMask);
return true;
}
// PSHUFHW: permute upper 4 elements only.
if (isUndefOrInRange(HiMask, 4, 8) &&
isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
// Offset the HiMask so that we can create the shuffle immediate.
int OffsetHiMask[4];
for (int i = 0; i != 4; ++i)
OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
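// e.g. HiMask {5, 4, 7, 6} offsets to {1, 0, 3, 2}, which encodes as the
// shuffle immediate 0xB1.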
Shuffle = X86ISD::PSHUFHW;
ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
}
}
// Attempt to match against byte/bit shifts.
// FIXME: Add 512-bit support.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
Mask, 0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
return true;
}
}
return false;
}
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
SDValue &V1, SDValue &V2, const SDLoc &DL,
SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
SrcVT = DstVT = MVT::v2f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
SrcVT = DstVT = MVT::v4f32;
return true;
}
}
// Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
Subtarget)) {
DstVT = MaskVT;
return true;
}
}
// Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
Subtarget)) {
SrcVT = DstVT = MaskVT;
if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
return true;
}
}
return false;
}
static bool matchBinaryPermuteShuffle(
MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
// Attempt to match against PALIGNR byte rotate.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
PermuteImm = ByteRotation;
return true;
}
}
// Attempt to combine to X86ISD::BLENDI.
if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
(Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
(MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
ForceV2Zero, BlendMask)) {
if (MaskVT == MVT::v16i16) {
// We can only use v16i16 PBLENDW if the lanes are repeated.
SmallVector<int, 8> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
RepeatedMask)) {
assert(RepeatedMask.size() == 8 &&
"Repeated mask size doesn't match!");
PermuteImm = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
PermuteImm |= 1 << i;
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
Shuffle = X86ISD::BLENDI;
ShuffleVT = MaskVT;
return true;
}
} else {
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
ShuffleVT = MaskVT;
return true;
}
}
}
// Attempt to combine to INSERTPS, but only if it has elements that need to
// be set to zero.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
MaskVT.is128BitVector() &&
llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) &&
matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
return true;
}
// Attempt to combine to SHUFPD.
if (AllowFloatDomain && EltSizeInBits == 64 &&
((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
bool ForceV1Zero = false, ForceV2Zero = false;
if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
PermuteImm, Mask, Zeroable)) {
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
}
}
// Attempt to combine to SHUFPS.
if (AllowFloatDomain && EltSizeInBits == 32 &&
((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
SmallVector<int, 4> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
// Match each half of the repeated mask to determine whether it's just
// referencing one of the vectors, is zeroable, or is entirely undef.
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
int M0 = RepeatedMask[Offset];
int M1 = RepeatedMask[Offset + 1];
if (isUndefInRange(RepeatedMask, Offset, 2)) {
return DAG.getUNDEF(MaskVT);
} else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
S0 = (SM_SentinelUndef == M0 ? -1 : 0);
S1 = (SM_SentinelUndef == M1 ? -1 : 1);
return getZeroVector(MaskVT, Subtarget, DAG, DL);
} else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
return V1;
} else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
return V2;
}
return SDValue();
};
int ShufMask[4] = {-1, -1, -1, -1};
SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
if (Lo && Hi) {
V1 = Lo;
V2 = Hi;
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
PermuteImm = getV4X86ShuffleImm(ShufMask);
return true;
}
}
}
// Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
MaskVT.is128BitVector() &&
matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
return true;
}
return false;
}
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget);
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask,
bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
"Unexpected number of shuffle inputs!");
// Find the inputs that enter the chain. Note that multiple uses are OK
// here; we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);
SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
: peekThroughBitcasts(Inputs[1]));
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
MVT RootVT = Root.getSimpleValueType();
assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
VT2.getSizeInBits() == RootVT.getSizeInBits() &&
"Vector size mismatch");
SDLoc DL(Root);
SDValue Res;
unsigned NumBaseMaskElts = BaseMask.size();
if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
return DAG.getBitcast(RootVT, V1);
}
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
(RootVT.isFloatingPoint() && Depth >= 1) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are an AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
// from being reused.
// TODO - this currently prevents all lane shuffles from occurring.
// TODO - check for writemasks usage instead of always preventing combining.
// TODO - attempt to narrow Mask back to writemask size.
bool IsEVEXShuffle =
RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
// Attempt to match a subvector broadcast.
// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
if (UnaryShuffle &&
(BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
SDValue Src = Inputs[0];
if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(0).isUndef() &&
Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
Src.getValueType(),
Src.getOperand(1)));
}
}
}
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
// Handle 128-bit lane shuffles of 256-bit vectors.
// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
// we need to use the zeroing feature.
// TODO - this should support binary shuffles.
if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
!(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
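// e.g. BaseMask {1, SM_SentinelZero} encodes as PermMask 0x81: lane 1 of
// V1 in the low half, with the high half zeroed.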
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getUNDEF(ShuffleVT),
DAG.getTargetConstant(PermMask, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// For masks that have been widened to 128-bit elements or more,
// narrow back down to 64-bit elements.
SmallVector<int, 64> Mask;
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
} else {
Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
}
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
// Determine the effective mask value type.
FloatDomain &= (32 <= MaskEltSizeInBits);
MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
: MVT::getIntegerVT(MaskEltSizeInBits);
MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
// Only allow legal mask types.
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
return SDValue();
// Attempt to match the mask against known shuffle patterns.
MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
// TODO: Should we indicate which domain is preferred if both are allowed?
bool AllowFloatDomain = FloatDomain || (Depth >= 3);
bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
APInt KnownUndef, KnownZero;
resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
APInt Zeroable = KnownUndef | KnownZero;
if (UnaryShuffle) {
// If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
// directly if we don't shuffle the lower element and we shuffle the upper
// (zero) elements within themselves.
if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
(cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
MaskEltSizeInBits) == 0) {
unsigned Scale =
cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
MaskEltSizeInBits;
ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
return DAG.getBitcast(RootVT, V1);
}
}
// Attempt to match against broadcast-from-vector.
// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
&& (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
MayFoldLoad(V1.getOperand(0))) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
if (Subtarget.hasAVX2()) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
}
}
SDValue NewV1 = V1; // Save operand in case early exit happens.
if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
return DAG.getBitcast(RootVT, Res);
}
if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
return DAG.getBitcast(RootVT, Res);
}
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
if (matchBinaryPermuteShuffle(
MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// Typically from here on, we need an integer version of MaskVT.
MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
// Annoyingly, SSE4A instructions don't map into the above match helpers.
if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
Zeroable)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
V2 = DAG.getBitcast(IntMaskVT, V2);
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 1)
return SDValue();
// Depth threshold above which we can efficiently use variable mask shuffles.
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
bool MaskContainsZeros =
any_of(Mask, [](int M) { return M == SM_SentinelZero; });
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX2() &&
(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
return DAG.getBitcast(RootVT, Res);
}
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
// vector as the second source.
if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
Res = DAG.getBitcast(MaskVT, V1);
SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
return DAG.getBitcast(RootVT, Res);
}
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
DAG, Subtarget))
return WideShuffle;
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
return DAG.getBitcast(RootVT, Res);
}
return SDValue();
}
// See if we can combine a single input shuffle with zeros to a bit-mask,
// which is much simpler than any shuffle.
if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
APInt UndefElts(NumMaskElts, 0);
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
UndefElts.setBit(i);
continue;
}
if (M == SM_SentinelZero)
continue;
EltBits[i] = AllOnes;
}
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
Res = DAG.getBitcast(MaskVT, V1);
unsigned AndOpcode =
FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
return DAG.getBitcast(RootVT, Res);
}
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes, use a variable-mask VPERMILPS.
// TODO: Combine other mask types at higher depths.
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
for (int M : Mask) {
SDValue Idx =
M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
VPermIdx.push_back(Idx);
}
SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
return DAG.getBitcast(RootVT, Res);
}
// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
// to VPERMIL2PD/VPERMIL2PS.
if (AllowVariableMask && Subtarget.hasXOP() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
MaskVT == MVT::v8f32)) {
// VPERMIL2 Operation.
// Bits[3] - Match Bit.
// Bits[2:1] - (Per Lane) PD Shuffle Mask.
// Bits[2:0] - (Per Lane) PS Shuffle Mask.
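// e.g. for v4f32 (a single 128-bit lane), selecting element 2 of V2
// encodes as index 2 + 4 = 6; a zeroable element is encoded as 8 below,
// with M2ZImm set to 2.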
unsigned NumLanes = MaskVT.getSizeInBits() / 128;
unsigned NumEltsPerLane = NumMaskElts / NumLanes;
SmallVector<int, 8> VPerm2Idx;
unsigned M2ZImm = 0;
for (int M : Mask) {
if (M == SM_SentinelUndef) {
VPerm2Idx.push_back(-1);
continue;
}
if (M == SM_SentinelZero) {
M2ZImm = 2;
VPerm2Idx.push_back(8);
continue;
}
int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
VPerm2Idx.push_back(Index);
}
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// If we have 3 or more shuffle instructions or a chain involving a variable
// mask, we can replace them with a single PSHUFB instruction profitably.
// Intel's manuals suggest only using PSHUFB if doing so replaces 5
// instructions, but in practice PSHUFB tends to be *very* fast, so we're
// more aggressive.
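// e.g. with a v4i32 mask, Ratio = 4 and mask element M = 1 expands to the
// byte indices {4, 5, 6, 7}; zeroable elements become 255, whose set top
// bit makes PSHUFB clear the byte.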
if (UnaryShuffle && AllowVariableMask &&
((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<SDValue, 16> PSHUFBMask;
int NumBytes = RootVT.getSizeInBits() / 8;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
if (M == SM_SentinelZero) {
PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
assert((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
Res = DAG.getBitcast(ByteVT, V1);
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
return DAG.getBitcast(RootVT, Res);
}
// With XOP, if we have a 128-bit binary input shuffle we can always combine
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
// slower than PSHUFB on targets that support both.
if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
// VPPERM Mask Operation
// Bits[4:0] - Byte Index (0 - 31)
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
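// e.g. selector byte 18 copies byte 2 of V2 (bytes 16-31 index into V2),
// while 0x80 (Bits[7:5] = 4) produces a zero byte - hence the constant 128
// used below for SM_SentinelZero.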
SmallVector<SDValue, 16> VPPERMMask;
int NumBytes = 16;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
if (M == SM_SentinelZero) {
VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::v16i8;
V1 = DAG.getBitcast(ByteVT, V1);
V2 = DAG.getBitcast(ByteVT, V2);
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
return DAG.getBitcast(RootVT, Res);
}
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
DAG, Subtarget))
return WideShuffle;
// If we have a dual input shuffle then lower to VPERMV3.
if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() &&
(MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() &&
(MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
return DAG.getBitcast(RootVT, Res);
}
// Failed to find any combines.
return SDValue();
}
// Combine an arbitrary chain of shuffles + extract_subvectors into a single
// instruction if possible.
//
// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
// type size to attempt to combine:
// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
// -->
// extract_subvector(shuffle(x,y,m2),0)
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NumMaskElts = BaseMask.size();
unsigned NumInputs = Inputs.size();
if (NumInputs == 0)
return SDValue();
SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
SmallVector<unsigned, 4> Offsets(NumInputs, 0);
// Peek through subvectors.
// TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
for (unsigned i = 0; i != NumInputs; ++i) {
SDValue &Src = WideInputs[i];
unsigned &Offset = Offsets[i];
Src = peekThroughBitcasts(Src);
EVT BaseVT = Src.getValueType();
while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(Src.getOperand(1))) {
Offset += Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
}
WideSizeInBits = std::max(WideSizeInBits,
(unsigned)Src.getValueSizeInBits());
assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
"Unexpected subvector extraction");
Offset /= BaseVT.getVectorNumElements();
Offset *= NumMaskElts;
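// e.g. extracting v4i32 subvector 1 of a v8i32 source yields a constant
// operand of 4; with NumMaskElts = 4 this normalizes to Offset = 4 wide
// mask elements.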
}
// Bail if we're always extracting from the lowest subvectors;
// combineX86ShuffleChain should match this for the current width.
if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
return SDValue();
EVT RootVT = Root.getValueType();
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned Scale = WideSizeInBits / RootSizeInBits;
assert((WideSizeInBits % RootSizeInBits) == 0 &&
"Unexpected subvector extraction");
// If the src vector types aren't the same, see if we can extend
// them to match each other.
// TODO: Support different scalar types?
EVT WideSVT = WideInputs[0].getValueType().getScalarType();
if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
Op.getValueType().getScalarType() != WideSVT;
}))
return SDValue();
for (SDValue &NewInput : WideInputs) {
assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
"Shuffle vector size mismatch");
if (WideSizeInBits > NewInput.getValueSizeInBits())
NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
SDLoc(NewInput), WideSizeInBits);
assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
"Unexpected subvector extraction");
}
// Create new mask for larger type.
for (unsigned i = 1; i != NumInputs; ++i)
Offsets[i] += i * Scale * NumMaskElts;
SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
for (int &M : WideMask) {
if (M < 0)
continue;
M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
}
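// e.g. with hypothetical values NumMaskElts = 4 and Offsets = {0, 12}, mask
// element M = 5 (lane 1 of input 1) is remapped to (5 % 4) + 12 = 13.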
WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
assert(!WideInputs.empty() && "Shuffle with no inputs detected");
if (WideInputs.size() > 2)
return SDValue();
// Increase depth for every upper subvector we've peeked through.
Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
// Attempt to combine wider chain.
// TODO: Can we use a better Root?
SDValue WideRoot = WideInputs[0];
if (SDValue WideShuffle = combineX86ShuffleChain(
WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget)) {
WideShuffle =
extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
return DAG.getBitcast(RootVT, WideShuffle);
}
return SDValue();
}
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
ArrayRef<int> Mask, SDValue Root,
bool HasVariableMask,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Root.getSimpleValueType();
unsigned SizeInBits = VT.getSizeInBits();
unsigned NumMaskElts = Mask.size();
unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
unsigned NumOps = Ops.size();
// Extract constant bits from each source op.
bool OneUseConstantOp = false;
SmallVector<APInt, 16> UndefEltsOps(NumOps);
SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
for (unsigned i = 0; i != NumOps; ++i) {
SDValue SrcOp = Ops[i];
OneUseConstantOp |= SrcOp.hasOneUse();
if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
RawBitsOps[i]))
return SDValue();
}
// Only fold if at least one of the constants is used only once or
// the combined shuffle has included a variable mask shuffle; this
// is to avoid constant pool bloat.
if (!OneUseConstantOp && !HasVariableMask)
return SDValue();
// Shuffle the constant bits according to the mask.
SDLoc DL(Root);
APInt UndefElts(NumMaskElts, 0);
APInt ZeroElts(NumMaskElts, 0);
APInt ConstantElts(NumMaskElts, 0);
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
APInt::getNullValue(MaskSizeInBits));
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
UndefElts.setBit(i);
continue;
} else if (M == SM_SentinelZero) {
ZeroElts.setBit(i);
continue;
}
assert(0 <= M && M < (int)(NumMaskElts * NumOps));
unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
if (SrcUndefElts[SrcMaskIdx]) {
UndefElts.setBit(i);
continue;
}
auto &SrcEltBits = RawBitsOps[SrcOpIdx];
APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
ZeroElts.setBit(i);
continue;
}
ConstantElts.setBit(i);
ConstantBitData[i] = Bits;
}
assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
// Attempt to create a zero vector.
if ((UndefElts | ZeroElts).isAllOnesValue())
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
// Create the constant data.
MVT MaskSVT;
if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
else
MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
return SDValue();
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
return DAG.getBitcast(VT, CstOp);
}
/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
/// equivalent. In most cases, this is just an encoding size win, but
/// sometimes we will collapse multiple generic shuffles into a single
/// special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
/// instructions, and replace them with the slightly more expensive SSSE3
/// PSHUFB instruction if available. We do this as the last combining step
/// to ensure we avoid using PSHUFB if we can implement the shuffle with
/// a suitable short sequence of other instructions. The PSHUFB will either
/// use a register or have to read from memory and so is slightly (but only
/// slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(RootMask.size() > 0 &&
(RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
"Illegal shuffle root mask");
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
if (Depth >= MaxRecursionDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
SDValue Op = SrcOps[SrcOpIndex];
Op = peekThroughOneUseBitcasts(Op);
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return SDValue(); // Bail if we hit a non-vector.
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
// TODO - determine Op's demanded elts from RootMask.
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
APInt OpUndef, OpZero;
APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
OpZero, DAG, Depth, false))
return SDValue();
SmallVector<int, 64> Mask;
SmallVector<SDValue, 16> Ops;
// We don't need to merge masks if the root is empty.
bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
if (EmptyRoot) {
// Only resolve zeros if it will remove an input; otherwise we might end
// up in an infinite loop.
bool ResolveKnownZeros = true;
if (!OpZero.isNullValue()) {
APInt UsedInputs = APInt::getNullValue(OpInputs.size());
for (int i = 0, e = OpMask.size(); i != e; ++i) {
int M = OpMask[i];
if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
continue;
UsedInputs.setBit(M / OpMask.size());
if (UsedInputs.isAllOnesValue()) {
ResolveKnownZeros = false;
break;
}
}
}
resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
ResolveKnownZeros);
Mask = OpMask;
Ops.append(OpInputs.begin(), OpInputs.end());
} else {
resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
// Add the inputs to the Ops list, avoiding duplicates.
Ops.append(SrcOps.begin(), SrcOps.end());
auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
// Attempt to find an existing match.
SDValue InputBC = peekThroughBitcasts(Input);
for (int i = 0, e = Ops.size(); i < e; ++i)
if (InputBC == peekThroughBitcasts(Ops[i]))
return i;
// Match failed - should we replace an existing Op?
if (InsertionPoint >= 0) {
Ops[InsertionPoint] = Input;
return InsertionPoint;
}
// Add to the end of the Ops list.
Ops.push_back(Input);
return Ops.size() - 1;
};
SmallVector<int, 2> OpInputIdx;
for (SDValue OpInput : OpInputs)
OpInputIdx.push_back(
AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
(OpMask.size() > RootMask.size() &&
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
// This function can be performance-critical, so we rely on the power-of-2
// knowledge that we have about the mask sizes to replace div/rem ops with
// bit-masks and shifts.
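// (For power-of-2 N: x % N == x & (N - 1) and x / N == x >> log2(N).)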
assert(isPowerOf2_32(RootMask.size()) &&
"Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
unsigned RootRatio =
std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
assert((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!");
assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
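// e.g. if RootMask has 4 elements and OpMask has 16, then RootRatio = 4,
// OpRatio = 1 and MaskWidth = 16: each root mask element covers 4 entries
// of the merged mask.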
Mask.resize(MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by
// the root mask to get us all the way to the root value arrangement. The
// reason for this order is that we are recursing up the operation chain.
for (unsigned i = 0; i < MaskWidth; ++i) {
unsigned RootIdx = i >> RootRatioLog2;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
Mask[i] = RootMask[RootIdx];
continue;
}
unsigned RootMaskedIdx =
RootRatio == 1
? RootMask[RootIdx]
: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
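// e.g. with RootRatio = 4, root index 3 expands to the four wide lanes
// (3 << 2) + 0 .. (3 << 2) + 3, i.e. 12..15.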
// Just insert the scaled root mask value if it references an input other
// than the SrcOp we're currently inserting.
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
Mask[i] = RootMaskedIdx;
continue;
}
RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef; it doesn't matter which ones
// we use.
Mask[i] = OpMask[OpIdx];
continue;
}
// OK, we have non-zero lanes; map them through to one of the Op's inputs.
unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
: (OpMask[OpIdx] << OpRatioLog2) +
(RootMaskedIdx & (OpRatio - 1));
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
Mask[i] = OpMaskedIdx;
}
}
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
// Handle the all undef/zero cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
return DAG.getUNDEF(Root.getValueType());
// TODO - should we handle the mixed zero/undef case as well? Just returning
// a zero mask will lose information on undef elements possibly reducing
// future combine possibilities.
if (all_of(Mask, [](int Idx) { return Idx < 0; }))
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= IsOpVariableMask;
// Update the list of shuffle nodes that have been combined so far.
SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
SrcNodes.end());
CombinedNodes.push_back(Op.getNode());
// See if we can recurse into each shuffle source op (if it's a target
// shuffle). The source op should generally only be combined if it either has
// a single use (i.e. current Op) or all its users have already been combined;
// if not, we can still combine but should prevent generation of variable
// shuffles to avoid constant pool bloat.
// Don't recurse if we already have more source ops than we can combine in
// the remaining recursion depth.
if (Ops.size() < (MaxRecursionDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
// For empty roots, we need to resolve zeroable elements before combining
// them with other shuffles.
SmallVector<int, 64> ResolvedMask = Mask;
if (EmptyRoot)
resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
bool AllowVar = false;
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
HasVariableMask, AllowVar, DAG, Subtarget))
return Res;
}
}
// Attempt to constant fold all of the constant source ops.
if (SDValue Cst = combineX86ShufflesConstants(
Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
return Cst;
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() <= 2) {
// Minor canonicalization of the accumulated shuffle mask to make it easier
// to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
SmallVector<int, 64> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
}
// Canonicalization of binary shuffle masks to improve pattern matching by
// commuting the inputs.
if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(Ops[0], Ops[1]);
}
// Finally, try to combine into a single shuffle instruction.
return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget);
}
// If that failed and any input is extracted then try to combine as a
// shuffle with the larger type.
return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
HasVariableMask, AllowVariableMask,
DAG, Subtarget);
}
/// Helper entry wrapper to combineX86ShufflesRecursively.
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
/*HasVarMask*/ false,
/*AllowVarMask*/ true, DAG, Subtarget);
}
/// Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming
/// v4 PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
SmallVector<SDValue, 2> Ops;
bool IsUnary;
bool HaveMask =
getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
// If we have more than 128 bits, only the low 128 bits of the shuffle mask
// matter. Check that the upper masks are repeats and remove them.
if (VT.getSizeInBits() > 128) {
int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
for (int j = 0; j < LaneElts; ++j)
assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
"Mask doesn't repeat in high 128-bit lanes!");
#endif
Mask.resize(LaneElts);
}
switch (N.getOpcode()) {
case X86ISD::PSHUFD:
return Mask;
case X86ISD::PSHUFLW:
Mask.resize(4);
return Mask;
case X86ISD::PSHUFHW:
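// PSHUFHW shuffles words 4-7 and leaves the low words as the identity;
// drop the low words and rebase the indices to 0-3 so callers see a
// uniform v4 mask.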
Mask.erase(Mask.begin(), Mask.begin() + 4);
for (int &M : Mask)
M -= 4;
return Mask;
default:
llvm_unreachable("No valid shuffle instruction found!");
}
}
/// Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
// Walk up a single-use chain looking for a combinable shuffle. Keep a stack
// of the shuffles in the chain so that we can form a fresh chain to replace
// this one.
SmallVector<SDValue, 8> Chain;
SDValue V = N.getOperand(0);
for (; V.hasOneUse(); V = V.getOperand(0)) {
switch (V.getOpcode()) {
default:
return SDValue(); // Nothing combined!
case ISD::BITCAST:
// Skip bitcasts as we always know the type for the target specific
// instructions.
continue;
case X86ISD::PSHUFD:
// Found another dword shuffle.
break;
case X86ISD::PSHUFLW:
// Check that the low words (being shuffled) are the identity in the
// dword shuffle, and the high words are self-contained.
if (Mask[0] != 0 || Mask[1] != 1 ||
!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
return SDValue();
Chain.push_back(V);
continue;
case X86ISD::PSHUFHW:
// Check that the high words (being shuffled) are the identity in the
// dword shuffle, and the low words are self-contained.
if (Mask[2] != 2 || Mask[3] != 3 ||
!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
return SDValue();
Chain.push_back(V);
continue;
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
V.getSimpleValueType().getVectorElementType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
unsigned CombineOp =
V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
if (V.getOperand(0) != V.getOperand(1) ||
!V->isOnlyUserOf(V.getOperand(0).getNode()))
return SDValue();
Chain.push_back(V);
V = V.getOperand(0);
do {
switch (V.getOpcode()) {
default:
return SDValue(); // Nothing to combine.
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
if (V.getOpcode() == CombineOp)
break;
Chain.push_back(V);
LLVM_FALLTHROUGH;
case ISD::BITCAST:
V = V.getOperand(0);
continue;
}
break;
} while (V.hasOneUse());
break;
}
// Break out of the loop if we break out of the switch.
break;
}
if (!V.hasOneUse())
// We fell out of the loop without finding a viable combining instruction.
return SDValue();
// Merge this node's mask and our incoming mask.
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
for (int &M : Mask)
M = VMask[M];
V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Rebuild the chain around this new shuffle.
while (!Chain.empty()) {
SDValue W = Chain.pop_back_val();
if (V.getValueType() != W.getOperand(0).getValueType())
V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
switch (W.getOpcode()) {
default:
llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
break;
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
break;
}
}
if (V.getValueType() != N.getValueType())
V = DAG.getBitcast(N.getValueType(), V);
// Return the new chain to replace N.
return V;
}
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
// single instruction.
if (VT.getScalarSizeInBits() == 64 &&
(Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
Opcode == X86ISD::UNPCKL)) {
auto BC0 = peekThroughBitcasts(N.getOperand(0));
auto BC1 = peekThroughBitcasts(N.getOperand(1));
EVT VT0 = BC0.getValueType();
EVT VT1 = BC1.getValueType();
unsigned Opcode0 = BC0.getOpcode();
unsigned Opcode1 = BC1.getOpcode();
if (Opcode0 == Opcode1 && VT0 == VT1 &&
(Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
SDValue Lo, Hi;
if (Opcode == X86ISD::MOVSD) {
Lo = BC1.getOperand(0);
Hi = BC0.getOperand(1);
} else {
Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
}
SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
return DAG.getBitcast(VT, Horiz);
}
}
switch (Opcode) {
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
EVT SrcVT = Src.getValueType();
EVT BCVT = BC.getValueType();
// If broadcasting from another shuffle, attempt to simplify it.
// TODO - we really need a general SimplifyDemandedVectorElts mechanism.
if (isTargetShuffle(BC.getOpcode()) &&
VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
SM_SentinelUndef);
for (unsigned i = 0; i != Scale; ++i)
DemandedMask[i] = i;
if (SDValue Res = combineX86ShufflesRecursively(
{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
}
// broadcast(bitcast(src)) -> bitcast(broadcast(src))
// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
if (Src.getOpcode() == ISD::BITCAST &&
SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
VT.getVectorNumElements());
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
}
// Reduce broadcast source vector to lowest 128-bits.
if (SrcVT.getSizeInBits() > 128)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
extract128BitVector(Src, 0, DAG, DL));
// broadcast(scalar_to_vector(x)) -> broadcast(x).
if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
// Share broadcast with the longest vector and extract low subvector (free).
for (SDNode *User : Src->uses())
if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
User->getValueSizeInBits(0) > VT.getSizeInBits()) {
return extractSubVector(SDValue(User, 0), 0, DAG, DL,
VT.getSizeInBits());
}
// vbroadcast(scalarload X) -> vbroadcast_load X
// For float loads, extract other uses of the scalar from the broadcast.
if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
// If the load value is used only by N, replace it via CombineTo N.
bool NoReplaceExtract = Src.hasOneUse();
DCI.CombineTo(N.getNode(), BcastLd);
if (NoReplaceExtract) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
} else {
SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
DAG.getIntPtrConstant(0, DL));
DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
}
return N; // Return N so it doesn't get rechecked!
}
return SDValue();
}
case X86ISD::BLENDI: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
// blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
// TODO: Handle MVT::v16i16 repeated blend mask.
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
MVT SrcVT = N0.getOperand(0).getSimpleValueType();
if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
SrcVT.getScalarSizeInBits() >= 32) {
unsigned BlendMask = N.getConstantOperandVal(2);
unsigned Size = VT.getVectorNumElements();
unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
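// scaleVectorShuffleBlendMask repeats each mask bit Scale times for the
// narrower elements, e.g. a v4i64 blend mask 0b0101 becomes the v8i32
// blend mask 0b00110011.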
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
N1.getOperand(0),
DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
}
}
return SDValue();
}
case X86ISD::VPERMI: {
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
if (N0.getOpcode() == ISD::BITCAST &&
N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
SDValue Src = N0.getOperand(0);
EVT SrcVT = Src.getValueType();
SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
return DAG.getBitcast(VT, Res);
}
return SDValue();
}
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
Mask = getPSHUFShuffleMask(N);
assert(Mask.size() == 4);
break;
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
// Canonicalize scalar FPOps:
// MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
// If commutable, allow OP(N1[0], N0[0]).
unsigned Opcode1 = N1.getOpcode();
if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
Opcode1 == ISD::FDIV) {
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
if (N10 == N0 ||
(N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
if (N10 != N0)
std::swap(N10, N11);
MVT SVT = VT.getVectorElementType();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
return DAG.getNode(Opcode, DL, VT, N0, SclVec);
}
}
return SDValue();
}
case X86ISD::INSERTPS: {
assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
SDValue Op2 = N.getOperand(2);
unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
unsigned ZeroMask = InsertPSMask & 0xF;
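// INSERTPS immediate layout: bits [7:6] select the source element, bits
// [5:4] select the destination element and bits [3:0] form the zero mask.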
// If we zero out all elements from Op0 then we don't need to reference it.
if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// If we zero out the element from Op1 then we don't need to reference it.
if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// Attempt to merge insertps Op1 with an inner target shuffle node.
SmallVector<int, 8> TargetMask1;
SmallVector<SDValue, 2> Ops1;
APInt KnownUndef1, KnownZero1;
if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
KnownZero1)) {
if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
// Zero/UNDEF insertion - zero out element and remove dependency.
InsertPSMask |= (1u << DstIdx);
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
// Update insertps mask srcidx and reference the source input directly.
int M = TargetMask1[SrcIdx];
assert(0 <= M && M < 8 && "Shuffle index out of range");
InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
Op1 = Ops1[M < 4 ? 0 : 1];
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
// Attempt to merge insertps Op0 with an inner target shuffle node.
SmallVector<int, 8> TargetMask0;
SmallVector<SDValue, 2> Ops0;
APInt KnownUndef0, KnownZero0;
if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
KnownZero0)) {
bool Updated = false;
bool UseInput00 = false;
bool UseInput01 = false;
for (int i = 0; i != 4; ++i) {
if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
// No change if element is already zero or the inserted element.
continue;
} else if (KnownUndef0[i] || KnownZero0[i]) {
// If the target mask is undef/zero then we must zero the element.
InsertPSMask |= (1u << i);
Updated = true;
continue;
}
// The input vector element must be inline.
int M = TargetMask0[i];
if (M != i && M != (i + 4))
return SDValue();
// Determine which inputs of the target shuffle we're using.
UseInput00 |= (0 <= M && M < 4);
UseInput01 |= (4 <= M);
}
// If we're not using both inputs of the target shuffle then use the
// referenced input directly.
if (UseInput00 && !UseInput01) {
Updated = true;
Op0 = Ops0[0];
} else if (!UseInput00 && UseInput01) {
Updated = true;
Op0 = Ops0[1];
}
if (Updated)
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
// If we're inserting an element from a vbroadcast load, fold the
// load into the X86insertps instruction. We need to convert the scalar
// load to a vector and clear the source lane of the INSERTPS control.
if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
MemIntr->getBasePtr(),
MemIntr->getMemOperand());
SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
Load),
DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
return Insert;
}
}
return SDValue();
}
default:
return SDValue();
}
// Nuke no-op shuffles that show up after combining.
if (isNoopShuffleMask(Mask))
return N.getOperand(0);
// Look for simplifications involving one or two shuffle instructions.
SDValue V = N.getOperand(0);
switch (N.getOpcode()) {
default:
break;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations. Note that it has to at least flip the
// dwords as otherwise it would have been removed as a no-op.
if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
int DMask[] = {0, 1, 2, 3};
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + 1;
DMask[DOffset + 1] = DOffset + 0;
MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
V = DAG.getBitcast(DVT, V);
V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
return DAG.getBitcast(VT, V);
}
// Look for shuffle patterns which can be implemented as a single unpack.
// FIXME: This doesn't handle the location of the PSHUFD generically, and
// only works when we have a PSHUFD followed by two half-shuffles.
if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
(V.getOpcode() == X86ISD::PSHUFLW ||
V.getOpcode() == X86ISD::PSHUFHW) &&
V.getOpcode() != N.getOpcode() &&
V.hasOneUse()) {
SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int WordMask[8];
for (int i = 0; i < 4; ++i) {
WordMask[i + NOffset] = Mask[i] + NOffset;
WordMask[i + VOffset] = VMask[i] + VOffset;
}
// Map the word mask through the DWord mask.
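// Each dword index d in DMask covers the pair of words 2*d and 2*d+1.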
int MappedMask[8];
for (int i = 0; i < 8; ++i)
MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
// We can replace all three shuffles with an unpack.
V = DAG.getBitcast(VT, D.getOperand(0));
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
DL, VT, V, V);
}
}
}
break;
case X86ISD::PSHUFD:
if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
return NewN;
break;
}
return SDValue();
}
/// Checks if the shuffle mask takes subsequent elements
/// alternately from two vectors.
/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
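/// On success, \p Op0Even is set to true if operand 0 supplies the
/// even-numbered lanes.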
static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
int ParitySrc[2] = {-1, -1};
unsigned Size = Mask.size();
for (unsigned i = 0; i != Size; ++i) {
int M = Mask[i];
if (M < 0)
continue;
// Make sure we are using the matching element from the input.
if ((M % Size) != i)
return false;
// Make sure we use the same input for all elements of the same parity.
int Src = M / Size;
if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
return false;
ParitySrc[i % 2] = Src;
}
// Make sure each input is used.
if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
return false;
Op0Even = ParitySrc[0] == 0;
return true;
}
/// Returns true iff the shuffle node \p N can be replaced with an
/// ADDSUB(SUBADD) operation. If true is returned, the operands of the
/// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and
/// \p Opnd1.
///
/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
/// shuffle nodes so they are easier to match generically. We also insert
/// dummy vector shuffle nodes for the operands which explicitly discard the
/// lanes unused by this operation, to try to flow the fact that they're
/// unused through the rest of the combiner.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
bool &IsSubAdd) {
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
!VT.getSimpleVT().isFloatingPoint())
return false;
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
// Make sure we have an FADD and an FSUB.
if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
(V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
V1.getOpcode() == V2.getOpcode())
return false;
// If there are other uses of these operations we can't fold them.
if (!V1->hasOneUse() || !V2->hasOneUse())
return false;
// Ensure that both operations have the same operands. Note that we can
// commute the FADD operands.
SDValue LHS, RHS;
if (V1.getOpcode() == ISD::FSUB) {
LHS = V1->getOperand(0); RHS = V1->getOperand(1);
if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
return false;
} else {
assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
LHS = V2->getOperand(0); RHS = V2->getOperand(1);
if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
(V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
return false;
}
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
bool Op0Even;
if (!isAddSubOrSubAddMask(Mask, Op0Even))
return false;
// It's a subadd if the vector in the even parity is an FADD.
IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
: V2->getOpcode() == ISD::FADD;
Opnd0 = LHS;
Opnd1 = RHS;
return true;
}
/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue combineShuffleToFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return SDValue();
MVT VT = N->getSimpleValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
return SDValue();
// We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue FMAdd = Op0, FMSub = Op1;
if (FMSub.getOpcode() != X86ISD::FMSUB)
std::swap(FMAdd, FMSub);
if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
FMAdd.getOperand(2) != FMSub.getOperand(2))
return SDValue();
// Check for correct shuffle mask.
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
bool Op0Even;
if (!isAddSubOrSubAddMask(Mask, Op0Even))
return SDValue();
// FMAddSub takes its zeroth operand from the FMSub node.
SDLoc DL(N);
bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
FMAdd.getOperand(2));
}
/// Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
return V;
SDValue Opnd0, Opnd1;
bool IsSubAdd;
if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
return SDValue();
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
}
if (IsSubAdd)
return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
// X86 targets with 512-bit ADDSUB instructions!
if (VT.is512BitVector())
return SDValue();
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
return SDValue();
EVT VT = N->getValueType(0);
// We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
if (VT.getVectorElementType() != MVT::i32 &&
VT.getVectorElementType() != MVT::i64 &&
VT.getVectorElementType() != MVT::f32 &&
VT.getVectorElementType() != MVT::f64)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Check that both sources are concats with undef.
if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
!N1.getOperand(1).isUndef())
return SDValue();
// Construct the new shuffle mask. Elements from the first source retain their
// index, but elements from the second source no longer need to skip an undef.
SmallVector<int, 8> Mask;
int NumElts = VT.getVectorNumElements();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
for (int Elt : SVOp->getMask())
Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
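// e.g. for v8i32 (NumElts = 8), index 9 (element 1 of the second concat)
// becomes 9 - 4 = 5, i.e. element 1 of t2 within the combined concat.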
SDLoc DL(N);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
N1.getOperand(0));
return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}
/// Eliminate a redundant shuffle of a horizontal math op.
static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
return SDValue();
// For a broadcast, peek through an extract element of index 0 to find the
// horizontal op: broadcast (ext_vec_elt HOp, 0)
EVT VT = N->getValueType(0);
if (Opcode == X86ISD::VBROADCAST) {
SDValue SrcOp = N->getOperand(0);
if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
SrcOp.getValueType() == MVT::f64 &&
SrcOp.getOperand(0).getValueType() == VT &&
isNullConstant(SrcOp.getOperand(1)))
N = SrcOp.getNode();
}
SDValue HOp = N->getOperand(0);
if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
return SDValue();
// 128-bit horizontal math instructions are defined to operate on adjacent
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
// ...similarly for v2f64 and v8i16.
if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
// The shuffle that we are eliminating may have allowed the horizontal op to
// have an undemanded (undefined) operand. Duplicate the other (defined)
// operand to ensure that the results are defined across all lanes without the
// shuffle.
auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
SDValue X;
if (HorizOp.getOperand(0).isUndef()) {
assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
X = HorizOp.getOperand(1);
} else if (HorizOp.getOperand(1).isUndef()) {
assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
X = HorizOp.getOperand(0);
} else {
return HorizOp;
}
return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
HorizOp.getValueType(), X, X);
};
// When the operands of a horizontal math op are identical, the low half of
// the result is the same as the high half. If a target shuffle is also
// replicating low and high halves (and without changing the type/length of
// the vector), we don't need the shuffle.
if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
// movddup (hadd X, X) --> hadd X, X
// broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
assert((HOp.getValueType() == MVT::v2f64 ||
HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
return updateHOp(HOp, DAG);
}
return SDValue();
}
// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
// but this should be tied to whatever horizontal op matching and shuffle
// canonicalization are producing.
if (HOp.getValueSizeInBits() == 128 &&
(isTargetShuffleEquivalent(Mask, {0, 0}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
return updateHOp(HOp, DAG);
if (HOp.getValueSizeInBits() == 256 &&
(isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
isTargetShuffleEquivalent(
Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
return updateHOp(HOp, DAG);
return SDValue();
}
/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
/// low half of each source vector and does not set any high half elements in
/// the destination vector, narrow the shuffle to half its original size.
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
if (!Shuf->getValueType(0).isSimple())
return SDValue();
MVT VT = Shuf->getSimpleValueType(0);
if (!VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
// See if we can ignore all of the high elements of the shuffle.
ArrayRef<int> Mask = Shuf->getMask();
if (!isUndefUpperHalf(Mask))
return SDValue();
// Check if the shuffle mask accesses only the low half of each input vector
// (half-index output is 0 or 2).
int HalfIdx1, HalfIdx2;
SmallVector<int, 8> HalfMask(Mask.size() / 2);
if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
(HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
return SDValue();
// Create a half-width shuffle to replace the unnecessarily wide shuffle.
// The trick is knowing that all of the insert/extract are actually free
// subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
// of narrow inputs into a narrow output, and that is always cheaper than
// the wide shuffle that we started with.
return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
Shuf->getOperand(1), HalfMask, HalfIdx1,
HalfIdx2, false, DAG, /*UseConcat*/true);
}
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
if (SDValue V = narrowShuffle(Shuf, DAG))
return V;
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
return HAddSub;
}
// Attempt to combine into a vector load/broadcast.
if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
// (concat_vectors t2, undef))
// Into:
// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
// since the latter can be lowered efficiently with VPERMD/VPERMQ.
if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
return ShufConcat;
if (isTargetShuffle(N->getOpcode())) {
SDValue Op(N, 0);
if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
return Shuffle;
// Try recursively combining arbitrary sequences of x86 shuffle
// instructions into higher-order shuffles. We do this after combining
// specific PSHUF instruction sequences into their minimal form so that we
// can evaluate how many specialized shuffle instructions are involved in
// a particular chain.
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
// Simplify source operands based on shuffle mask.
// TODO - merge this into combineX86ShufflesRecursively.
APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
return SDValue(N, 0);
}
// Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
// in the upper 64 bits.
// TODO: Can we generalize this using computeKnownBits?
if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
(VT == MVT::v2f64 || VT == MVT::v2i64) &&
N->getOperand(0).getOpcode() == ISD::BITCAST &&
(N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
SDValue In = N->getOperand(0).getOperand(0);
switch (In.getOpcode()) {
default:
break;
case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
if (In.getOperand(0).getValueType() == MVT::v2f64 ||
In.getOperand(0).getValueType() == MVT::v2i64)
return N->getOperand(0); // return the bitcast
break;
case X86ISD::STRICT_CVTTP2SI:
case X86ISD::STRICT_CVTTP2UI:
case X86ISD::STRICT_CVTSI2P:
case X86ISD::STRICT_CVTUI2P:
case X86ISD::STRICT_VFPROUND:
if (In.getOperand(1).getValueType() == MVT::v2f64 ||
In.getOperand(1).getValueType() == MVT::v2i64)
return N->getOperand(0);
break;
}
}
// Pull subvector inserts into undef through VZEXT_MOVL by making it an
// insert into a zero vector. This helps get VZEXT_MOVL closer to
// scalar_to_vectors where 256/512 are canonicalized to an insert and a
// 128-bit scalar_to_vector. This reduces the number of isel patterns.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
N->getOperand(0).hasOneUse() &&
N->getOperand(0).getOperand(0).isUndef() &&
isNullConstant(N->getOperand(0).getOperand(2))) {
SDValue In = N->getOperand(0).getOperand(1);
SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
Movl, N->getOperand(0).getOperand(2));
}
// If this is a vzmovl of a full vector load, replace it with a vzload,
// unless the load is volatile.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
ISD::isNormalLoad(N->getOperand(0).getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
if (LN->isSimple()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue VZLoad =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
VT.getVectorElementType(),
LN->getPointerInfo(),
LN->getAlignment(),
MachineMemOperand::MOLoad);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return VZLoad;
}
}
return SDValue();
}
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth) const {
int NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
// Handle special case opcodes.
switch (Opc) {
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
APInt LHSUndef, LHSZero;
APInt RHSUndef, RHSZero;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
Depth + 1))
return true;
if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
Depth + 1))
return true;
// Multiply by zero.
KnownZero = LHSZero | RHSZero;
break;
}
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA: {
// We only need the bottom 64-bits of the (128-bit) shift amount.
SDValue Amt = Op.getOperand(1);
MVT AmtVT = Amt.getSimpleValueType();
assert(AmtVT.is128BitVector() && "Unexpected value type");
// If the shift amount is only ever used as an SSE shift amount then we know
// that only the bottom 64 bits are ever used.
bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
unsigned UseOpc = Use->getOpcode();
return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
UseOpc == X86ISD::VSRA) &&
Use->getOperand(0) != Amt;
});
APInt AmtUndef, AmtZero;
unsigned NumAmtElts = AmtVT.getVectorNumElements();
APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
Depth + 1, AssumeSingleUse))
return true;
LLVM_FALLTHROUGH;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
APInt SrcUndef;
if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
Depth + 1))
return true;
// TODO convert SrcUndef to KnownUndef.
break;
}
case X86ISD::KSHIFTL: {
SDValue Src = Op.getOperand(0);
auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
unsigned ShiftAmt = Amt->getZExtValue();
if (ShiftAmt == 0)
return TLO.CombineTo(Op, Src);
// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
// single shift. We can do this if the bottom bits (which are shifted
// out) are never demanded.
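// e.g. kshiftl(kshiftr(X, 6), 4) --> kshiftr(X, 2) when the low 4 lanes
// are not demanded.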
if (Src.getOpcode() == X86ISD::KSHIFTR) {
if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
unsigned C1 = Src.getConstantOperandVal(1);
unsigned NewOpc = X86ISD::KSHIFTL;
int Diff = ShiftAmt - C1;
if (Diff < 0) {
Diff = -Diff;
NewOpc = X86ISD::KSHIFTR;
}
SDLoc dl(Op);
SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
return TLO.CombineTo(
Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
}
}
APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
Depth + 1))
return true;
KnownUndef <<= ShiftAmt;
KnownZero <<= ShiftAmt;
KnownZero.setLowBits(ShiftAmt);
break;
}
case X86ISD::KSHIFTR: {
SDValue Src = Op.getOperand(0);
auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
unsigned ShiftAmt = Amt->getZExtValue();
if (ShiftAmt == 0)
return TLO.CombineTo(Op, Src);
// If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
// single shift. We can do this if the top bits (which are shifted
// out) are never demanded.
if (Src.getOpcode() == X86ISD::KSHIFTL) {
if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
unsigned C1 = Src.getConstantOperandVal(1);
unsigned NewOpc = X86ISD::KSHIFTR;
int Diff = ShiftAmt - C1;
if (Diff < 0) {
Diff = -Diff;
NewOpc = X86ISD::KSHIFTL;
}
SDLoc dl(Op);
SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
return TLO.CombineTo(
Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
}
}
APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
Depth + 1))
return true;
KnownUndef.lshrInPlace(ShiftAmt);
KnownZero.lshrInPlace(ShiftAmt);
KnownZero.setHighBits(ShiftAmt);
break;
}
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt SrcUndef, SrcZero;
APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
// Aggressively peek through ops to get at the demanded elts.
// TODO - we should do this for all target/faux shuffles ops.
if (!DemandedElts.isAllOnesValue()) {
APInt DemandedSrcBits =
APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
SDValue NewN0 = SimplifyMultipleUseDemandedBits(
N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1);
SDValue NewN1 = SimplifyMultipleUseDemandedBits(
N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1);
if (NewN0 || NewN1) {
NewN0 = NewN0 ? NewN0 : N0;
NewN1 = NewN1 ? NewN1 : N1;
return TLO.CombineTo(Op,
TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
}
}
break;
}
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
APInt DemandedLHS, DemandedRHS;
getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt LHSUndef, LHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
RHSZero, TLO, Depth + 1))
return true;
break;
}
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
KnownZero = SrcZero.zextOrTrunc(NumElts);
KnownUndef = SrcUndef.zextOrTrunc(NumElts);
break;
}
case X86ISD::BLENDV: {
APInt SelUndef, SelZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
SelZero, TLO, Depth + 1))
return true;
// TODO: Use SelZero to adjust LHS/RHS DemandedElts.
APInt LHSUndef, LHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
RHSZero, TLO, Depth + 1))
return true;
KnownZero = LHSZero & RHSZero;
KnownUndef = LHSUndef & RHSUndef;
break;
}
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (!SrcVT.isVector())
return false;
// Don't bother broadcasting if we just need the 0'th element.
if (DemandedElts == 1) {
if (Src.getValueType() != VT)
Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
SDLoc(Op));
return TLO.CombineTo(Op, Src);
}
APInt SrcUndef, SrcZero;
APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::VPERMV: {
SDValue Mask = Op.getOperand(0);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::PSHUFB:
case X86ISD::VPERMV3:
case X86ISD::VPERMILPV: {
SDValue Mask = Op.getOperand(1);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::VPPERM:
case X86ISD::VPERMIL2: {
SDValue Mask = Op.getOperand(2);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
break;
}
}
// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
// demand any of the high elements, then narrow the op to 128/256-bits: e.g.
// (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
if ((VT.is256BitVector() || VT.is512BitVector()) &&
DemandedElts.lshr(NumElts / 2) == 0) {
unsigned SizeInBits = VT.getSizeInBits();
unsigned ExtSizeInBits = SizeInBits / 2;
// See if 512-bit ops only use the bottom 128-bits.
if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
// Zero upper elements.
case X86ISD::VZEXT_MOVL: {
SDLoc DL(Op);
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp =
TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
// Subvector broadcast.
case X86ISD::SUBV_BROADCAST: {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
if (Src.getValueSizeInBits() > ExtSizeInBits)
Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
else if (Src.getValueSizeInBits() < ExtSizeInBits) {
MVT SrcSVT = Src.getSimpleValueType().getScalarType();
MVT SrcVT =
MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
}
return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
TLO.DAG, DL, ExtSizeInBits));
}
// Byte shifts by immediate.
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
// Shift by uniform.
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA:
// Shift by immediate.
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI: {
SDLoc DL(Op);
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp =
TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
case X86ISD::VPERMI: {
// Simplify PERMPD/PERMQ to extract_subvector.
// TODO: This should be done in shuffle combining.
if (VT == MVT::v4f64 || VT == MVT::v4i64) {
SmallVector<int, 4> Mask;
DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
SDLoc DL(Op);
SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
return TLO.CombineTo(Op, Insert);
}
}
break;
}
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
// Saturated Packs.
case X86ISD::PACKSS:
case X86ISD::PACKUS:
// Horizontal Ops.
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
SDLoc DL(Op);
MVT ExtVT = VT.getSimpleVT();
ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
ExtSizeInBits / ExtVT.getScalarSizeInBits());
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue Ext1 =
extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
}
}
// Get target/faux shuffle mask.
APInt OpUndef, OpZero;
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
OpZero, TLO.DAG, Depth, false))
return false;
// Shuffle inputs must be the same size as the result.
if (OpMask.size() != (unsigned)NumElts ||
llvm::any_of(OpInputs, [VT](SDValue V) {
return VT.getSizeInBits() != V.getValueSizeInBits() ||
!V.getValueType().isVector();
}))
return false;
KnownZero = OpZero;
KnownUndef = OpUndef;
// Check if shuffle mask can be simplified to undef/zero/identity.
int NumSrcs = OpInputs.size();
for (int i = 0; i != NumElts; ++i)
if (!DemandedElts[i])
OpMask[i] = SM_SentinelUndef;
if (isUndefInRange(OpMask, 0, NumElts)) {
KnownUndef.setAllBits();
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
}
if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
KnownZero.setAllBits();
return TLO.CombineTo(
Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
}
for (int Src = 0; Src != NumSrcs; ++Src)
if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
// Attempt to simplify inputs.
for (int Src = 0; Src != NumSrcs; ++Src) {
// TODO: Support inputs of different types.
if (OpInputs[Src].getValueType() != VT)
continue;
int Lo = Src * NumElts;
APInt SrcElts = APInt::getNullValue(NumElts);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
int M = OpMask[i] - Lo;
if (0 <= M && M < NumElts)
SrcElts.setBit(M);
}
// TODO - Propagate input undef/zero elts.
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
}
// If we don't demand all elements, then attempt to combine to a simpler
// shuffle.
// TODO: Handle other depths, but first we need to handle the fact that
// it might combine to the same shuffle.
if (!DemandedElts.isAllOnesValue() && Depth == 0) {
SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i])
DemandedMask[i] = i;
SDValue NewShuffle = combineX86ShufflesRecursively(
{Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
/*AllowVarMask*/ true, TLO.DAG, Subtarget);
if (NewShuffle)
return TLO.CombineTo(Op, NewShuffle);
}
return false;
}
bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue Op, const APInt &OriginalDemandedBits,
const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
unsigned Depth) const {
EVT VT = Op.getValueType();
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
unsigned Opc = Op.getOpcode();
switch(Opc) {
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
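// For example (an illustrative sketch): masking each i64 element of X to its
// low 32 bits, as in (pmuludq (and X, splat(0x00000000FFFFFFFF)), Y), is
// redundant and simplifies to (pmuludq X, Y), because the AND only clears
// bits the multiply never reads.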
KnownBits KnownOp;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
// Aggressively peek through ops to get at the demanded low bits.
SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
if (DemandedLHS || DemandedRHS) {
DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
}
break;
}
case X86ISD::VSHLI: {
SDValue Op0 = Op.getOperand(0);
unsigned ShAmt = Op.getConstantOperandVal(1);
if (ShAmt >= BitWidth)
break;
APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
// single shift. We can do this if the bottom bits (which are shifted
// out) are never demanded.
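// For example (an illustrative sketch): (vshli (vsrli X, 2), 5) can become
// (vshli X, 3), and (vshli (vsrli X, 5), 2) can become (vsrli X, 3), when
// the low 5 (resp. 2) bits of the result are undemanded.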
if (Op0.getOpcode() == X86ISD::VSRLI &&
OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
unsigned Shift2Amt = Op0.getConstantOperandVal(1);
if (Shift2Amt < BitWidth) {
int Diff = ShAmt - Shift2Amt;
if (Diff == 0)
return TLO.CombineTo(Op, Op0.getOperand(0));
unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
SDValue NewShift = TLO.DAG.getNode(
NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
return TLO.CombineTo(Op, NewShift);
}
}
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits known zero.
Known.Zero.setLowBits(ShAmt);
break;
}
case X86ISD::VSRLI: {
unsigned ShAmt = Op.getConstantOperandVal(1);
if (ShAmt >= BitWidth)
break;
APInt DemandedMask = OriginalDemandedBits << ShAmt;
if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
OriginalDemandedElts, Known, TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
break;
}
case X86ISD::VSRAI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
if (ShAmt >= BitWidth)
break;
APInt DemandedMask = OriginalDemandedBits << ShAmt;
// If we just want the sign bit then we don't need to shift it.
if (OriginalDemandedBits.isSignMask())
return TLO.CombineTo(Op, Op0);
// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
if (Op0.getOpcode() == X86ISD::VSHLI &&
Op.getOperand(1) == Op0.getOperand(1)) {
SDValue Op00 = Op0.getOperand(0);
unsigned NumSignBits =
TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
if (ShAmt < NumSignBits)
return TLO.CombineTo(Op, Op00);
}
// If any of the demanded bits are produced by the sign extension, we also
// demand the input sign bit.
if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
DemandedMask.setSignBit();
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
if (Known.Zero[BitWidth - ShAmt - 1] ||
OriginalDemandedBits.countLeadingZeros() >= ShAmt)
return TLO.CombineTo(
Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
// High bits are known one.
if (Known.One[BitWidth - ShAmt - 1])
Known.One.setHighBits(ShAmt);
break;
}
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Vec = Op.getOperand(0);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
MVT VecVT = Vec.getSimpleValueType();
unsigned NumVecElts = VecVT.getVectorNumElements();
if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
unsigned Idx = CIdx->getZExtValue();
unsigned VecBitWidth = VecVT.getScalarSizeInBits();
// If we demand no bits from the vector then we must have demanded
// bits from the implicit zext - simplify to zero.
APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
if (DemandedVecBits == 0)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
APInt KnownUndef, KnownZero;
APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
KnownZero, TLO, Depth + 1))
return true;
KnownBits KnownVec;
if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
KnownVec, TLO, Depth + 1))
return true;
if (SDValue V = SimplifyMultipleUseDemandedBits(
Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
Known = KnownVec.zext(BitWidth, true);
return false;
}
break;
}
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
SDValue Vec = Op.getOperand(0);
SDValue Scl = Op.getOperand(1);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
MVT VecVT = Vec.getSimpleValueType();
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
unsigned Idx = CIdx->getZExtValue();
if (!OriginalDemandedElts[Idx])
return TLO.CombineTo(Op, Vec);
KnownBits KnownVec;
APInt DemandedVecElts(OriginalDemandedElts);
DemandedVecElts.clearBit(Idx);
if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
KnownVec, TLO, Depth + 1))
return true;
KnownBits KnownScl;
unsigned NumSclBits = Scl.getScalarValueSizeInBits();
APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
return true;
KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
Known.One = KnownVec.One & KnownScl.One;
Known.Zero = KnownVec.Zero & KnownScl.Zero;
return false;
}
break;
}
case X86ISD::PACKSS:
// PACKSS saturates to MIN/MAX integer values. So if we just want the
// sign bit then we can just ask for the source operand's sign bit.
// TODO - add known bits handling.
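// For example (an illustrative sketch): signed saturation preserves the sign
// bit, so the sign bit of each packed result element equals the sign bit of
// the corresponding wide source element, and (movmsk (packss X, Y)) only
// needs the sign bits of X and Y.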
if (OriginalDemandedBits.isSignMask()) {
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
KnownBits KnownLHS, KnownRHS;
APInt SignMask = APInt::getSignMask(BitWidth * 2);
if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
KnownLHS, TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
KnownRHS, TLO, Depth + 1))
return true;
}
// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
break;
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// If we only need the sign bit then we can use R directly.
if (OriginalDemandedBits.isSignMask() &&
ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
return TLO.CombineTo(Op, Op.getOperand(1));
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
unsigned SrcBits = SrcVT.getScalarSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
// If we don't need the sign bits at all just return zero.
if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
// Only demand the vector elements of the sign bits we need.
APInt KnownUndef, KnownZero;
APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
TLO, Depth + 1))
return true;
Known.Zero = KnownZero.zextOrSelf(BitWidth);
Known.Zero.setHighBits(BitWidth - NumElts);
// MOVMSK only uses the MSB from each vector element.
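// For example: MOVMSKPS on v4f32 packs the four element sign bits into bits
// 0-3 of the scalar result; all higher result bits are zero, as recorded by
// setHighBits above.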
KnownBits KnownSrc;
if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
KnownSrc, TLO, Depth + 1))
return true;
if (KnownSrc.One[SrcBits - 1])
Known.One.setLowBits(NumElts);
else if (KnownSrc.Zero[SrcBits - 1])
Known.Zero.setLowBits(NumElts);
return false;
}
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
SelectionDAG &DAG, unsigned Depth) const {
int NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
switch (Opc) {
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
// If we don't demand the inserted element, return the base vector.
SDValue Vec = Op.getOperand(0);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
MVT VecVT = Vec.getSimpleValueType();
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
!DemandedElts[CIdx->getZExtValue()])
return Vec;
break;
}
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// If we only need the sign bit then we can use R directly.
if (DemandedBits.isSignMask() &&
ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
return Op.getOperand(1);
break;
}
APInt ShuffleUndef, ShuffleZero;
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 2> ShuffleOps;
if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
// If all the demanded elts are from one operand and are inline,
// then we can use the operand directly.
int NumOps = ShuffleOps.size();
if (ShuffleMask.size() == (unsigned)NumElts &&
llvm::all_of(ShuffleOps, [VT](SDValue V) {
return VT.getSizeInBits() == V.getValueSizeInBits();
})) {
if (DemandedElts.isSubsetOf(ShuffleUndef))
return DAG.getUNDEF(VT);
if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
// Bitmask that indicates which ops have only been accessed 'inline'.
APInt IdentityOp = APInt::getAllOnesValue(NumOps);
for (int i = 0; i != NumElts; ++i) {
int M = ShuffleMask[i];
if (!DemandedElts[i] || ShuffleUndef[i])
continue;
int Op = M / NumElts;
int Index = M % NumElts;
if (M < 0 || Index != i) {
IdentityOp.clearAllBits();
break;
}
IdentityOp &= APInt::getOneBitSet(NumOps, Op);
if (IdentityOp == 0)
break;
}
assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
"Multiple identity shuffles detected");
if (IdentityOp != 0)
return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
}
}
return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
Op, DemandedBits, DemandedElts, DAG, Depth);
}
// Helper to peek through bitops/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
switch (Src.getOpcode()) {
case ISD::SETCC:
return Src.getOperand(0).getValueSizeInBits() == Size;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
checkBitcastSrcVectorSize(Src.getOperand(1), Size);
}
return false;
}
// Helper to push sign extension of vXi1 SETCC result through bitops.
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
SDValue Src, const SDLoc &DL) {
switch (Src.getOpcode()) {
case ISD::SETCC:
return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
case ISD::AND:
case ISD::XOR:
case ISD::OR:
return DAG.getNode(
Src.getOpcode(), DL, SExtVT,
signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
}
llvm_unreachable("Unexpected node type for vXi1 sign extension");
}
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
const SDLoc &DL,
const X86Subtarget &Subtarget) {
EVT SrcVT = Src.getValueType();
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
// If the input is a truncate from v16i8, v32i8 or v64i8, go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
(Src.getOperand(0).getValueType() == MVT::v16i8 ||
Src.getOperand(0).getValueType() == MVT::v32i8 ||
Src.getOperand(0).getValueType() == MVT::v64i8);
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
return SDValue();
// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
// v8f64. So all legal 128-bit and 256-bit vectors are covered except for
// v8i16 and v16i16.
// For these two cases, we can shuffle the upper element bytes to a
// consecutive sequence at the start of the vector and treat the results as
// v16i8 or v32i8; for v8i16 (handled as v16i8) this is the preferable
// solution. However, for v16i16 this is not the case, because the shuffle
// is expensive, so we avoid sign-extending to that type entirely.
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
MVT SExtVT;
bool PropagateSExt = false;
switch (SrcVT.getSimpleVT().SimpleTy) {
default:
return SDValue();
case MVT::v2i1:
SExtVT = MVT::v2i64;
break;
case MVT::v4i1:
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
SExtVT = MVT::v4i64;
PropagateSExt = true;
}
break;
case MVT::v8i1:
SExtVT = MVT::v8i16;
// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
// sign-extend to a 256-bit operation to match the compare.
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
checkBitcastSrcVectorSize(Src, 512))) {
SExtVT = MVT::v8i32;
PropagateSExt = true;
}
break;
case MVT::v16i1:
SExtVT = MVT::v16i8;
// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
// it is not profitable to sign-extend to 256-bit because this will
// require an extra cross-lane shuffle which is more expensive than
// truncating the result of the compare to 128-bits.
break;
case MVT::v32i1:
SExtVT = MVT::v32i8;
break;
case MVT::v64i1:
// If we have AVX512F but not AVX512BW, and the input is a truncate from
// v64i8 (checked earlier), then split the input and make two pmovmskbs.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) {
SExtVT = MVT::v64i8;
break;
}
return SDValue();
}
SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
: DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
V = getPMOVMSKB(DL, V, DAG, Subtarget);
} else {
if (SExtVT == MVT::v8i16)
V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
DAG.getUNDEF(MVT::v8i16));
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
EVT IntVT =
EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
V = DAG.getZExtOrTrunc(V, DL, IntVT);
return DAG.getBitcast(VT, V);
}
// Convert a vXi1 constant build vector to the same width scalar integer.
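// For example (illustrative): <i1 1, i1 0, i1 1, i1 1> becomes the i4
// constant 0b1101 - bit Idx is set when element Idx has its low bit set,
// and undef elements are treated as zero.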
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
EVT SrcVT = Op.getValueType();
assert(SrcVT.getVectorElementType() == MVT::i1 &&
"Expected a vXi1 vector");
assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
"Expected a constant build vector");
APInt Imm(SrcVT.getVectorNumElements(), 0);
for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
SDValue In = Op.getOperand(Idx);
if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
Imm.setBit(Idx);
}
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
return DAG.getConstant(Imm, SDLoc(Op), IntVT);
}
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
if (!DCI.isBeforeLegalizeOps())
return SDValue();
// Only do this if we have k-registers.
if (!Subtarget.hasAVX512())
return SDValue();
EVT DstVT = N->getValueType(0);
SDValue Op = N->getOperand(0);
EVT SrcVT = Op.getValueType();
if (!Op.hasOneUse())
return SDValue();
// Look for logic ops.
if (Op.getOpcode() != ISD::AND &&
Op.getOpcode() != ISD::OR &&
Op.getOpcode() != ISD::XOR)
return SDValue();
// Make sure we have a bitcast between mask registers and a scalar type.
if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
DstVT.isScalarInteger()) &&
!(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
SrcVT.isScalarInteger()))
return SDValue();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
LHS.getOperand(0).getValueType() == DstVT)
return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
DAG.getBitcast(DstVT, RHS));
if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
RHS.getOperand(0).getValueType() == DstVT)
return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
// If the RHS is a vXi1 build vector, this is a good reason to flip too.
// Most of these have to move a constant from the scalar domain anyway.
if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
RHS = combinevXi1ConstantToInteger(RHS, DAG);
return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
DAG.getBitcast(DstVT, LHS), RHS);
}
return SDValue();
}
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(BV);
unsigned NumElts = BV->getNumOperands();
SDValue Splat = BV->getSplatValue();
// Build MMX element from integer GPR or SSE float values.
auto CreateMMXElement = [&](SDValue V) {
if (V.isUndef())
return DAG.getUNDEF(MVT::x86mmx);
if (V.getValueType().isFloatingPoint()) {
if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
V = DAG.getBitcast(MVT::v2i64, V);
return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
}
V = DAG.getBitcast(MVT::i32, V);
} else {
V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
}
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
};
// Convert build vector ops to MMX data in the bottom elements.
SmallVector<SDValue, 8> Ops;
// Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
if (Splat) {
if (Splat.isUndef())
return DAG.getUNDEF(MVT::x86mmx);
Splat = CreateMMXElement(Splat);
if (Subtarget.hasSSE1()) {
// Unpack v8i8 to splat i8 elements to lowest 16-bits.
if (NumElts == 8)
Splat = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
Splat);
// Use PSHUFW to repeat 16-bit elements.
unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32),
Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
}
Ops.append(NumElts, Splat);
} else {
for (unsigned i = 0; i != NumElts; ++i)
Ops.push_back(CreateMMXElement(BV->getOperand(i)));
}
// Use tree of PUNPCKLs to build up general MMX vector.
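// For example (illustrative): with 8 byte elements the first round uses
// punpcklbw on byte pairs, the next punpcklwd on the resulting words, and
// the last punpckldq on the resulting dwords, halving Ops each round.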
while (Ops.size() > 1) {
unsigned NumOps = Ops.size();
unsigned IntrinOp =
(NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
: (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
: Intrinsic::x86_mmx_punpcklbw));
SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
for (unsigned i = 0; i != NumOps; i += 2)
Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
Ops[i], Ops[i + 1]);
Ops.resize(NumOps / 2);
}
return Ops[0];
}
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
if (DCI.isBeforeLegalize()) {
SDLoc dl(N);
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
// Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
// legalization destroys the v4i32 type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
N0.getOperand(0).getValueType() == MVT::v4i32 &&
ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
SDValue N00 = N0.getOperand(0);
// Only do this if we can avoid scalarizing the input.
if (ISD::isNormalLoad(N00.getNode()) ||
(N00.getOpcode() == ISD::BITCAST &&
N00.getOperand(0).getValueType() == MVT::v4f32)) {
SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
DAG.getBitcast(MVT::v4f32, N00));
return DAG.getZExtOrTrunc(V, dl, VT);
}
}
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
Subtarget.hasAVX512()) {
N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
N0 = DAG.getBitcast(MVT::v8i1, N0);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
DAG.getIntPtrConstant(0, dl));
}
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
// Use zeros for the widening if we already have some zeroes. This can
// allow SimplifyDemandedBits to remove scalar ANDs that may be down
// stream of this.
// FIXME: It might make sense to detect a concat_vectors with a mix of
// zeroes and undef and turn it into insert_subvector for i1 vectors as
// a separate combine. What we can't do is canonicalize the operands of
// such a concat or we'll get into a loop with SimplifyDemandedBits.
if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
SrcVT = LastOp.getValueType();
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
}
}
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
}
}
// Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
// replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
// due to insert_subvector legalization on KNL. By promoting the copy to i16
// we can help with known bits propagation from the vXi1 domain to the
// scalar domain.
if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
!Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getOperand(0).getValueType() == MVT::v16i1 &&
isNullConstant(N0.getOperand(1)))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
DAG.getBitcast(MVT::i16, N0.getOperand(0)));
// Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT
// determines the number of bits loaded. Remaining bits are zero.
if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
auto *BCast = cast<MemIntrinsicSDNode>(N0);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
VT.getVectorElementType(),
BCast->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
return ResNode;
}
// Since MMX types are special and don't usually interact with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
if (VT == MVT::x86mmx) {
// Detect MMX constant vectors.
APInt UndefElts;
SmallVector<APInt, 1> EltBits;
if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
SDLoc DL(N0);
// Handle zero-extension of i32 with MOVD.
if (EltBits[0].countLeadingZeros() >= 32)
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
// Else, bitcast to a double.
// TODO - investigate supporting sext 32-bit immediates on x86_64.
APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
}
// Detect bitcasts to x86mmx low word.
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
bool LowUndef = true, AllUndefOrZero = true;
for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N0.getOperand(i);
LowUndef &= Op.isUndef() || (i >= e/2);
AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
}
if (AllUndefOrZero) {
SDValue N00 = N0.getOperand(0);
SDLoc dl(N00);
N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
: DAG.getZExtOrTrunc(N00, dl, MVT::i32);
return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
}
}
// Detect bitcasts of 64-bit build vectors and convert to a
// MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
// lowest element.
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
SrcVT == MVT::v8i8))
return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
// Detect bitcasts between element or subvector extraction to x86mmx.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0.getOperand(0);
if (N00.getValueType().is128BitVector())
return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
DAG.getBitcast(MVT::v2i64, N00));
}
// Detect bitcasts from FP_TO_SINT to x86mmx.
if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
SDLoc DL(N0);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getUNDEF(MVT::v2i32));
return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
DAG.getBitcast(MVT::v2i64, Res));
}
}
// Try to remove a bitcast of constant vXi1 vector. We have to legalize
// most of these to scalar anyway.
if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
return combinevXi1ConstantToInteger(N0, DAG);
}
if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
isa<ConstantSDNode>(N0)) {
auto *C = cast<ConstantSDNode>(N0);
if (C->isAllOnesValue())
return DAG.getConstant(1, SDLoc(N0), VT);
if (C->isNullValue())
return DAG.getConstant(0, SDLoc(N0), VT);
}
// Try to remove bitcasts from input and output of mask arithmetic to
// remove GPR<->K-register crossings.
if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
return V;
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
// constant in an integer register and transferring it to an SSE register or
// transferring the SSE operand to integer register and back.
unsigned FPOpcode;
switch (N0.getOpcode()) {
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
default: return SDValue();
}
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64)))
return SDValue();
SDValue LogicOp0 = N0.getOperand(0);
SDValue LogicOp1 = N0.getOperand(1);
SDLoc DL0(N0);
// bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
}
// bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
}
return SDValue();
}
// Given an ABS node, detect the following pattern:
// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
// This is useful as it is the input into a SAD pattern.
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
SDValue AbsOp1 = Abs->getOperand(0);
if (AbsOp1.getOpcode() != ISD::SUB)
return false;
Op0 = AbsOp1.getOperand(0);
Op1 = AbsOp1.getOperand(1);
// Check if the operands of the sub are zero-extended from vectors of i8.
if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
Op1.getOpcode() != ISD::ZERO_EXTEND ||
Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
return false;
return true;
}
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
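// Note: PSADBW sums the absolute differences of the i8 elements within each
// 64-bit lane, producing one i64 result per lane (e.g. two v16i8 inputs
// yield a v2i64 result).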
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
const SDValue &Zext1, const SDLoc &DL,
const X86Subtarget &Subtarget) {
// Find the appropriate width for the PSADBW.
EVT InVT = Zext0.getOperand(0).getValueType();
unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
// "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
// fill in the missing vector elements with 0.
unsigned NumConcat = RegSize / InVT.getSizeInBits();
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
Ops[0] = Zext0.getOperand(0);
MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
Ops[0] = Zext1.getOperand(0);
SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
// Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
};
MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
PSADBWBuilder);
}
// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE41.
if (!Subtarget.hasSSE41())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
return SDValue();
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Src = DAG.matchBinOpReduction(
Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
if (!Src)
return SDValue();
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
return SDValue();
SDLoc DL(Extract);
SDValue MinPos = Src;
// First, reduce the source down to 128-bit, applying BinOp to lo/hi.
while (SrcVT.getSizeInBits() > 128) {
unsigned NumElts = SrcVT.getVectorNumElements();
unsigned NumSubElts = NumElts / 2;
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
unsigned SubSizeInBits = SrcVT.getSizeInBits();
SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
(SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
"Unexpected value type");
// PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
// to flip the values accordingly.
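// For example (an illustrative sketch): for SMAX, XOR with 0x7FFF composes a
// sign-bit flip (signed order -> unsigned order) with a full complement
// (max -> min), so SMAX(X) == XOR(UMIN(XOR(X, 0x7FFF)), 0x7FFF) and the UMIN
// can be done by PHMINPOSUW.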
SDValue Mask;
unsigned MaskEltsBits = ExtractVT.getSizeInBits();
if (BinOp == ISD::SMAX)
Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::SMIN)
Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::UMAX)
Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
// For v16i8 cases we need to perform UMIN on pairs of byte elements,
// shuffling each upper element down and inserting zeros. This means that the
// v16i8 UMIN will leave the upper byte of each pair as zero, performing the
// zero-extension ready for the PHMINPOSUW.
if (ExtractVT == MVT::i8) {
SDValue Upper = DAG.getVectorShuffle(
SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
{1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
}
// Perform the PHMINPOS on a v8i16 vector.
MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
MinPos = DAG.getBitcast(SrcVT, MinPos);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
DAG.getIntPtrConstant(0, DL));
}
// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
unsigned BitWidth = ExtractVT.getSizeInBits();
if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
return SDValue();
// Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
if (!Match && ExtractVT == MVT::i1)
Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
if (!Match)
return SDValue();
// EXTRACT_VECTOR_ELT can require implicit extension of the vector element
// which we can't support here for now.
if (Match.getScalarValueSizeInBits() != BitWidth)
return SDValue();
SDValue Movmsk;
SDLoc DL(Extract);
EVT MatchVT = Match.getValueType();
unsigned NumElts = MatchVT.getVectorNumElements();
unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ExtractVT == MVT::i1) {
// Special case for (pre-legalization) vXi1 reductions.
if (NumElts > 64 || !isPowerOf2_32(NumElts))
return SDValue();
if (TLI.isTypeLegal(MatchVT)) {
// If this is a legal AVX512 predicate type then we can just bitcast.
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
// Use combineBitcastvxi1 to create the MOVMSK.
while (NumElts > MaxElts) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
NumElts /= 2;
}
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
}
if (!Movmsk)
return SDValue();
Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
} else {
// Bail with AVX512VL (which uses predicate registers).
if (Subtarget.hasVLX())
return SDValue();
unsigned MatchSizeInBits = Match.getValueSizeInBits();
if (!(MatchSizeInBits == 128 ||
(MatchSizeInBits == 256 && Subtarget.hasAVX())))
return SDValue();
// Make sure this isn't a vector of 1 element. The perf win from using
// MOVMSK diminishes with fewer elements in the reduction, but it is
// generally better to get the comparison over to the GPRs as soon as
// possible to reduce the number of vector ops.
if (Match.getValueType().getVectorNumElements() < 2)
return SDValue();
// Check that we are extracting a reduction of all sign bits.
if (DAG.ComputeNumSignBits(Match) != BitWidth)
return SDValue();
if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
MatchSizeInBits = Match.getValueSizeInBits();
}
// For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
MVT MaskSrcVT;
if (64 == BitWidth || 32 == BitWidth)
MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
MatchSizeInBits / BitWidth);
else
MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
NumElts = MaskSrcVT.getVectorNumElements();
}
assert((NumElts <= 32 || NumElts == 64) &&
"Not expecting more than 64 elements");
MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
if (BinOp == ISD::XOR) {
// parity -> (AND (CTPOP(MOVMSK X)), 1)
SDValue Mask = DAG.getConstant(1, DL, CmpVT);
SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk);
Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask);
return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
}
SDValue CmpC;
ISD::CondCode CondCode;
if (BinOp == ISD::OR) {
// any_of -> MOVMSK != 0
CmpC = DAG.getConstant(0, DL, CmpVT);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
DL, CmpVT);
CondCode = ISD::CondCode::SETEQ;
}
// The setcc produces an i8 of 0/1, so extend that to the result width and
// negate to get the final 0/-1 mask value.
EVT SetccVT =
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
}
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// PSADBW is only supported on SSE2 and up.
if (!Subtarget.hasSSE2())
return SDValue();
// Verify the type we're extracting from is an integer type wider than i16.
EVT VT = Extract->getOperand(0).getValueType();
if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
return SDValue();
unsigned RegSize = 128;
if (Subtarget.useBWIRegs())
RegSize = 512;
else if (Subtarget.hasAVX())
RegSize = 256;
// We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
if (RegSize / VT.getVectorNumElements() < 8)
return SDValue();
// Match shuffle + add pyramid.
ISD::NodeType BinOp;
SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
// The operand is expected to be zero extended from i8 (verified in
// detectZextAbsDiff). To produce i64 and wider results, an additional
// any/zero/sign extend is expected above the reduction. That extend has no
// mathematical effect on the result: the bits being extended are already
// zero, so even a sign extend only copies a zero sign bit. It is therefore
// correct to skip the sign/zero extend node.
if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
Root.getOpcode() == ISD::ZERO_EXTEND ||
Root.getOpcode() == ISD::ANY_EXTEND))
Root = Root.getOperand(0);
// If there was a match, we want Root to be an ABS node that is the root of
// an abs-diff pattern.
if (!Root || Root.getOpcode() != ISD::ABS)
return SDValue();
// Check whether we have an abs-diff pattern feeding into the ABS.
SDValue Zext0, Zext1;
if (!detectZextAbsDiff(Root, Zext0, Zext1))
return SDValue();
// Create the SAD instruction.
SDLoc DL(Extract);
SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
// If the original vector was wider than 8 elements, sum over the results
// in the SAD vector.
unsigned Stages = Log2_32(VT.getVectorNumElements());
MVT SadVT = SAD.getSimpleValueType();
if (Stages > 3) {
unsigned SadElems = SadVT.getVectorNumElements();
for(unsigned i = Stages - 3; i > 0; --i) {
SmallVector<int, 16> Mask(SadElems, -1);
for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
Mask[j] = MaskEnd + j;
SDValue Shuffle =
DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
}
}
MVT Type = Extract->getSimpleValueType(0);
unsigned TypeSizeInBits = Type.getSizeInBits();
// Return the lowest TypeSizeInBits bits.
MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
SAD = DAG.getBitcast(ResVT, SAD);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
Extract->getOperand(1));
}
// Attempt to peek through a target shuffle and extract the scalar from the
// source.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDLoc dl(N);
SDValue Src = N->getOperand(0);
SDValue Idx = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getVectorElementType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Don't attempt this for boolean mask vectors or unknown extraction indices.
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
SDValue SrcBC = peekThroughBitcasts(Src);
// Handle extract(broadcast(scalar_value)); it doesn't matter what the index is.
if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
SDValue SrcOp = SrcBC.getOperand(0);
if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, SrcOp);
}
// If we're extracting a single element from a broadcast load and there are
// no other users, just create a single load.
if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
VT.getSizeInBits() == SrcBCWidth) {
SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
MemIntr->getBasePtr(),
MemIntr->getPointerInfo(),
MemIntr->getAlignment(),
MemIntr->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
return Load;
}
}
// Handle extract(truncate(x)) for 0'th index.
// TODO: Treat this as a faux shuffle?
// TODO: When can we use this for general indices?
if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() &&
isNullConstant(Idx)) {
Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
Src = DAG.getBitcast(SrcVT, Src);
return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
}
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
if (Mask.size() != NumSrcElts) {
if ((NumSrcElts % Mask.size()) == 0) {
SmallVector<int, 16> ScaledMask;
int Scale = NumSrcElts / Mask.size();
scaleShuffleMask<int>(Scale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
} else if ((Mask.size() % NumSrcElts) == 0) {
// Simplify Mask based on the demanded element.
int ExtractIdx = (int)N->getConstantOperandVal(1);
int Scale = Mask.size() / NumSrcElts;
int Lo = Scale * ExtractIdx;
int Hi = Scale * (ExtractIdx + 1);
for (int i = 0, e = (int)Mask.size(); i != e; ++i)
if (i < Lo || Hi <= i)
Mask[i] = SM_SentinelUndef;
SmallVector<int, 16> WidenedMask;
while (Mask.size() > NumSrcElts &&
canWidenShuffleElements(Mask, WidenedMask))
Mask = std::move(WidenedMask);
// TODO - investigate support for wider shuffle masks with known upper
// undef/zero elements for implicit zero-extension.
}
}
// Check if narrowing/widening failed.
if (Mask.size() != NumSrcElts)
return SDValue();
int SrcIdx = Mask[N->getConstantOperandVal(1)];
// If the shuffle source element is undef/zero then we can just accept it.
if (SrcIdx == SM_SentinelUndef)
return DAG.getUNDEF(VT);
if (SrcIdx == SM_SentinelZero)
return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
: DAG.getConstant(0, dl, VT);
SDValue SrcOp = Ops[SrcIdx / Mask.size()];
SrcIdx = SrcIdx % Mask.size();
// We can only extract other elements from 128-bit vectors, and only in
// certain circumstances depending on the SSE level.
// TODO: Investigate using extract_subvector for larger vectors.
// TODO: Investigate float/double extraction if it will be just stored.
if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
assert(SrcSVT == VT && "Unexpected extraction type");
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
}
if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
(SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
"Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);
}
return SDValue();
}
/// Extracting a scalar FP value from vector element 0 is free, so extract each
/// operand first, then perform the math as a scalar op.
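/// For example: extractelt (fadd X, Y), 0 --> fadd (extractelt X, 0),
/// (extractelt Y, 0).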
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
SDValue Vec = ExtElt->getOperand(0);
SDValue Index = ExtElt->getOperand(1);
EVT VT = ExtElt->getValueType(0);
EVT VecVT = Vec.getValueType();
// TODO: If this is a unary/expensive/expand op, allow extraction from a
// non-zero element because the shuffle+scalar op will be cheaper?
if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
return SDValue();
// Vector FP compares don't fit the pattern of FP math ops (propagate, not
// extract, the condition code), so deal with those as a special-case.
if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
if (OpVT != MVT::f32 && OpVT != MVT::f64)
return SDValue();
// extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
SDLoc DL(ExtElt);
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
Vec.getOperand(0), Index);
SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
Vec.getOperand(1), Index);
return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
}
if (VT != MVT::f32 && VT != MVT::f64)
return SDValue();
// Vector FP selects don't fit the pattern of FP math ops (because the
// condition has a different type and we have to change the opcode), so deal
// with those here.
// FIXME: This is restricted to pre type legalization by ensuring the setcc
// has i1 elements. If we loosen this we need to convert vector bool to a
// scalar bool.
if (Vec.getOpcode() == ISD::VSELECT &&
Vec.getOperand(0).getOpcode() == ISD::SETCC &&
Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
// ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
SDLoc DL(ExtElt);
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
Vec.getOperand(0).getValueType().getScalarType(),
Vec.getOperand(0), Index);
SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
Vec.getOperand(1), Index);
SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
Vec.getOperand(2), Index);
return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
}
// TODO: This switch could include FNEG and the x86-specific FP logic ops
// (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
// missed load folding and fma+fneg combining.
switch (Vec.getOpcode()) {
case ISD::FMA: // Begin 3 operands
case ISD::FMAD:
case ISD::FADD: // Begin 2 operands
case ISD::FSUB:
case ISD::FMUL:
case ISD::FDIV:
case ISD::FREM:
case ISD::FCOPYSIGN:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
case X86ISD::FMAX:
case X86ISD::FMIN:
case ISD::FABS: // Begin 1 operand
case ISD::FSQRT:
case ISD::FRINT:
case ISD::FCEIL:
case ISD::FTRUNC:
case ISD::FNEARBYINT:
case ISD::FROUND:
case ISD::FFLOOR:
case X86ISD::FRCP:
case X86ISD::FRSQRT: {
// extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
SDLoc DL(ExtElt);
SmallVector<SDValue, 4> ExtOps;
for (SDValue Op : Vec->ops())
ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
}
default:
return SDValue();
}
llvm_unreachable("All opcodes should return within switch");
}
/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.
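/// For example (illustrative): a v4i32 add reduction ending in
/// extract (add (shuffle X), X), 0 can be re-emitted as two PHADDD steps
/// followed by the element-0 extract, per the loop at the end of this
/// function.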
static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
// We need at least SSE2 to do anything here.
if (!Subtarget.hasSSE2())
return SDValue();
ISD::NodeType Opc;
SDValue Rdx =
DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
if (!Rdx)
return SDValue();
SDValue Index = ExtElt->getOperand(1);
assert(isNullConstant(Index) &&
"Reduction doesn't end in an extract from index 0");
EVT VT = ExtElt->getValueType(0);
EVT VecVT = Rdx.getValueType();
if (VecVT.getScalarType() != VT)
return SDValue();
SDLoc DL(ExtElt);
// vXi8 reduction - sub-128-bit vector.
if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
if (VecVT == MVT::v4i8) {
// Pad with zero.
if (Subtarget.hasSSE41()) {
Rdx = DAG.getBitcast(MVT::i32, Rdx);
Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
DAG.getConstant(0, DL, MVT::v4i32), Rdx,
DAG.getIntPtrConstant(0, DL));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
} else {
Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
DAG.getConstant(0, DL, VecVT));
}
}
if (Rdx.getValueType() == MVT::v8i8) {
// Pad with undef.
Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
DAG.getUNDEF(MVT::v8i8));
}
Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
DAG.getConstant(0, DL, MVT::v16i8));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
// Must be a >=128-bit vector with pow2 elements.
if ((VecVT.getSizeInBits() % 128) != 0 ||
!isPowerOf2_32(VecVT.getVectorNumElements()))
return SDValue();
// vXi8 reduction - sum lo/hi halves then use PSADBW.
if (VT == MVT::i8) {
while (Rdx.getValueSizeInBits() > 128) {
unsigned HalfSize = VecVT.getSizeInBits() / 2;
unsigned HalfElts = VecVT.getVectorNumElements() / 2;
SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
VecVT = Rdx.getValueType();
}
assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
SDValue Hi = DAG.getVectorShuffle(
MVT::v16i8, DL, Rdx, Rdx,
{8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
// Only use (F)HADD opcodes if they aren't microcoded or if we're minimizing codesize.
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
return SDValue();
unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
// 256-bit horizontal instructions operate on 128-bit chunks rather than
// across the whole vector, so we need an extract + hop preliminary stage.
// This is the only step where the operands of the hop are not the same value.
// TODO: We could extend this to handle 512-bit or even longer vectors.
if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
unsigned NumElts = VecVT.getVectorNumElements();
SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
VecVT = Rdx.getValueType();
}
if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
!((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
return SDValue();
// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
for (unsigned i = 0; i != ReductionSteps; ++i)
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
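/// (Summary note: beyond index generation, this is also the driver that tries
/// the SAD, MOVMSK, horizontal min/max, reduction-to-horizontal and FP
/// scalarization combines below.)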
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
SDValue InputVector = N->getOperand(0);
SDValue EltIdx = N->getOperand(1);
auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
EVT SrcVT = InputVector.getValueType();
EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
// Integer Constant Folding.
if (CIdx && VT.isInteger()) {
APInt UndefVecElts;
SmallVector<APInt, 16> EltBits;
unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
EltBits, true, false)) {
uint64_t Idx = CIdx->getZExtValue();
if (UndefVecElts[Idx])
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
dl, VT);
}
}
if (IsPextr) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(
SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
return SDValue(N, 0);
// PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
if ((InputVector.getOpcode() == X86ISD::PINSRB ||
InputVector.getOpcode() == X86ISD::PINSRW) &&
InputVector.getOperand(2) == EltIdx) {
assert(SrcVT == InputVector.getOperand(0).getValueType() &&
"Vector type mismatch");
SDValue Scl = InputVector.getOperand(1);
Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
return DAG.getZExtOrTrunc(Scl, dl, VT);
}
// TODO - Remove this once we can handle the implicit zero-extension of
// X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
// combineBasicSADPattern.
return SDValue();
}
// Detect mmx extraction of all bits as an i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getBitcast(VT, InputVector);
}
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
// Check whether this extract is the root of a sum of absolute differences
// pattern. This has to be done here because we really want it to happen
// pre-legalization.
if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
return SAD;
// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;
// Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
return V;
if (SDValue V = scalarizeExtEltFP(N, DAG))
return V;
// Attempt to extract an i1 element by using MOVMSK to extract the sign bits
// and then testing the relevant element.
if (CIdx && SrcVT.getScalarType() == MVT::i1) {
SmallVector<SDNode *, 16> BoolExtracts;
auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(Use->getOperand(1)) &&
Use->getValueType(0) == MVT::i1) {
BoolExtracts.push_back(Use);
return true;
}
return false;
};
if (all_of(InputVector->uses(), IsBoolExtract) &&
BoolExtracts.size() > 1) {
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
if (SDValue BC =
combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
for (SDNode *Use : BoolExtracts) {
// extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
unsigned MaskIdx = Use->getConstantOperandVal(1);
APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
DCI.CombineTo(Use, Res);
}
return SDValue(N, 0);
}
}
}
return SDValue();
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
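/// Summary of the folds attempted below (illustrative):
///   vselect Cond, 111..., 000... -> Cond
///   vselect Cond, 111..., X      -> or Cond, X
///   vselect Cond, X, 000...      -> and Cond, X
///   vselect Cond, 000..., X      -> andn Cond, X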
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
SDLoc DL(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (N->getOpcode() != ISD::VSELECT)
return SDValue();
assert(CondVT.isVector() && "Vector select expects a vector selector!");
// Check if the first operand is all zeros and Cond type is vXi1.
// This situation only applies to AVX512.
// TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
// TODO: Can we assert that both operands are not zeros (because that should
// get simplified at node creation time)?
bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
// If both inputs are 0/undef, create a complete zero vector.
// FIXME: As noted above this should be handled by DAGCombiner/getNode.
if (TValIsAllZeros && FValIsAllZeros) {
if (VT.isFloatingPoint())
return DAG.getConstantFP(0.0, DL, VT);
return DAG.getConstant(0, DL, VT);
}
if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
// To use the condition operand as a bitwise mask, it must have elements that
// are the same size as the select elements. That is, the condition operand must
// have already been promoted from the IR select condition type <N x i1>.
// Don't check if the types themselves are equal because that excludes
// vector floating-point selects.
if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
// Try to invert the condition if the true value is not all 1s and the false
// value is not all 0s. Only do this if the condition has one use.
bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
// Check if the selector will be produced by CMPP*/PCMP*.
Cond.getOpcode() == ISD::SETCC &&
// Check if SETCC has already been promoted.
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
CondVT) {
bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
if (TValIsAllZeros || FValIsAllOnes) {
SDValue CC = Cond.getOperand(2);
ISD::CondCode NewCC = ISD::getSetCCInverse(
cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
NewCC);
std::swap(LHS, RHS);
TValIsAllOnes = FValIsAllOnes;
FValIsAllZeros = TValIsAllZeros;
}
}
// Cond value must be 'sign splat' to be converted to a logical op.
if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
return SDValue();
// vselect Cond, 111..., 000... -> Cond
if (TValIsAllOnes && FValIsAllZeros)
return DAG.getBitcast(VT, Cond);
if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
return SDValue();
// vselect Cond, 111..., X -> or Cond, X
if (TValIsAllOnes) {
SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
return DAG.getBitcast(VT, Or);
}
// vselect Cond, X, 000... -> and Cond, X
if (FValIsAllZeros) {
SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
return DAG.getBitcast(VT, And);
}
// vselect Cond, 000..., X -> andn Cond, X
if (TValIsAllZeros) {
MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
return DAG.getBitcast(VT, AndN);
}
return SDValue();
}
/// If both arms of a vector select are concatenated vectors, split the select,
/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
return SDValue();
// TODO: Split 512-bit vectors too?
EVT VT = N->getValueType(0);
if (!VT.is256BitVector())
return SDValue();
// TODO: Split as long as any 2 of the 3 operands are concatenated?
SDValue Cond = N->getOperand(0);
SDValue TVal = N->getOperand(1);
SDValue FVal = N->getOperand(2);
SmallVector<SDValue, 4> CatOpsT, CatOpsF;
if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
!collectConcatOps(TVal.getNode(), CatOpsT) ||
!collectConcatOps(FVal.getNode(), CatOpsF))
return SDValue();
auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
makeBlend, /*CheckBWI*/ false);
}
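/// Try to turn a select between two integer constants into shift/add/LEA math
/// instead of a CMOV, e.g. (illustrative) select Cond, 8, 0 becomes
/// zext(Cond) << 3 via the (zext(Cond) * (TC - FC)) + FC rewrite below.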
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
SDLoc DL(N);
auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
if (!TrueC || !FalseC)
return SDValue();
// Don't do this for crazy integer types.
EVT VT = N->getValueType(0);
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
// We're going to use the condition bit in math or logic ops. We could allow
// this with a wider condition value (post-legalization it becomes an i8),
// but if nothing is creating selects that late, it doesn't matter.
if (Cond.getValueType() != MVT::i1)
return SDValue();
// A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
// 3, 5, or 9 with i32/i64, so those get transformed too.
// TODO: For constants that overflow or do not differ by power-of-2 or small
// multiplier, convert to 'and' + 'add'.
const APInt &TrueVal = TrueC->getAPIntValue();
const APInt &FalseVal = FalseC->getAPIntValue();
bool OV;
APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
if (OV)
return SDValue();
APInt AbsDiff = Diff.abs();
if (AbsDiff.isPowerOf2() ||
((VT == MVT::i32 || VT == MVT::i64) &&
(AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
// We need a positive multiplier constant for shift/LEA codegen. The 'not'
// of the condition can usually be folded into a compare predicate, but even
// without that, the sequence should be cheaper than a CMOV alternative.
if (TrueVal.slt(FalseVal)) {
Cond = DAG.getNOT(DL, Cond, MVT::i1);
std::swap(TrueC, FalseC);
}
// select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
// Multiply condition by the difference if non-one.
if (!AbsDiff.isOneValue())
R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
// Add the base if non-zero.
if (!FalseC->isNullValue())
R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
return R;
}
return SDValue();
}
/// If this is a *dynamic* select (non-constant condition) and we can match
/// this node with one of the variable blend instructions, restructure the
/// condition so that blends can use the high (sign) bit of each element.
/// This function will also call SimplifyDemandedBits on already created
/// BLENDV to perform additional simplifications.
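/// For example (illustrative): a v16i8 VSELECT with a non-constant condition
/// can become X86ISD::BLENDV, which keys each lane on the condition's sign
/// bit, so only that bit is demanded of the condition below.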
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
if ((N->getOpcode() != ISD::VSELECT &&
N->getOpcode() != X86ISD::BLENDV) ||
ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
// Don't optimize before the condition has been transformed to a legal type
// and don't ever optimize vector selects that map to AVX512 mask-registers.
unsigned BitWidth = Cond.getScalarValueSizeInBits();
if (BitWidth < 8 || BitWidth > 64)
return SDValue();
// We can only handle the cases where VSELECT is directly legal on the
// subtarget. We custom lower VSELECT nodes with constant conditions and
// this makes it hard to see whether a dynamic VSELECT will correctly
// lower, so we both check the operation's status and explicitly handle the
// cases where a *dynamic* blend will fail even though a constant-condition
// blend could be custom lowered.
// FIXME: We should find a better way to handle this class of problems.
// Potentially, we should combine constant-condition vselect nodes
// pre-legalization into shuffles and not mark as many types as custom
// lowered.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
return SDValue();
// FIXME: We don't support i16-element blends currently. We could and
// should support them by making *all* the bits in the condition be set
// rather than just the high bit and using an i8-element blend.
if (VT.getVectorElementType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
if (VT.is128BitVector() && !Subtarget.hasSSE41())
return SDValue();
// Byte blends are only available in AVX2.
if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
return SDValue();
// There are no 512-bit blend instructions that use sign bits.
if (VT.is512BitVector())
return SDValue();
auto OnlyUsedAsSelectCond = [](SDValue Cond) {
for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
UI != UE; ++UI)
if ((UI->getOpcode() != ISD::VSELECT &&
UI->getOpcode() != X86ISD::BLENDV) ||
UI.getOperandNo() != 0)
return false;
return true;
};
if (OnlyUsedAsSelectCond(Cond)) {
APInt DemandedMask(APInt::getSignMask(BitWidth));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
return SDValue();
// If we changed the computation somewhere in the DAG, this change will
// affect all users of Cond. Update all the nodes so that we do not use
// the generic VSELECT anymore. Otherwise, we may perform wrong
// optimizations as we messed with the actual expectation for the vector
// boolean values.
for (SDNode *U : Cond->uses()) {
if (U->getOpcode() == X86ISD::BLENDV)
continue;
SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
Cond, U->getOperand(1), U->getOperand(2));
DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
DCI.AddToWorklist(U);
}
DCI.CommitTargetLoweringOpt(TLO);
return SDValue(N, 0);
}
// Otherwise we can still at least try to simplify multiple use bits.
APInt DemandedMask(APInt::getSignMask(BitWidth));
APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements()));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask,
DemandedElts, DAG, 0))
return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
V, N->getOperand(1), N->getOperand(2));
return SDValue();
}
// Try to match:
// (or (and (M, (sub 0, X)), (pandn M, X)))
// which is a special case of:
// (select M, (sub 0, X), X)
// Per:
// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
// We know that, if fNegate is 0 or 1:
// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
//
// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
// ( M ? -X : X) == ((X ^ M ) + (M & 1))
// This lets us transform our vselect to:
// (add (xor X, M), (and M, 1))
// And further to:
// (sub (xor X, M), M)
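// Illustrative sanity check of the identity: with M == -1 (all ones) and
// X == 5, (sub (xor X, M), M) is (-6) - (-1) == -5, and with M == 0 it is
// 5 - 0 == 5, matching (select M, (sub 0, X), X).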
static SDValue combineLogicBlendIntoConditionalNegate(
EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
EVT MaskVT = Mask.getValueType();
assert(MaskVT.isInteger() &&
DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
"Mask must be zero/all-bits");
if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
return SDValue();
if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
return SDValue();
auto IsNegV = [](SDNode *N, SDValue V) {
return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
};
SDValue V;
if (IsNegV(Y.getNode(), X))
V = X;
else if (IsNegV(X.getNode(), Y))
V = Y;
else
return SDValue();
SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
SDValue SubOp2 = Mask;
// If the negate was on the false side of the select, then
// the operands of the SUB need to be swapped. PR 27251.
// This is because the pattern being matched above is
// (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
// but if the pattern matched was
// (vselect M, X, (sub 0, X)), that is really the negation of the pattern
// above, -(vselect M, (sub 0, X), X), and therefore the replacement
// pattern also needs to be a negation of the replacement pattern above.
// And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
// sub accomplishes the negation of the replacement pattern.
if (V == Y)
std::swap(SubOp1, SubOp2);
SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
return DAG.getBitcast(VT, Res);
}
/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
// Try simplification again because we use this function to optimize
// BLENDV nodes that are not handled by the generic combiner.
if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
return V;
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
// Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
// Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
// can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
if (CondVT.isVector() && CondVT.isInteger() &&
CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
(!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
DL, DAG, Subtarget))
return V;
// Convert vselects with constant condition into shuffles.
if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
SmallVector<int, 64> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
}
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && VT != MVT::f128 &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget.hasSSE2() ||
(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
// Check for x CC y ? x : y.
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
switch (CC) {
default: break;
case ISD::SETULT:
// Converting this to a min would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETOLE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETULE:
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMIN;
break;
case ISD::SETOGE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGT:
// Converting this to a max would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGE:
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMAX;
break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
DAG.isEqualTo(RHS, Cond.getOperand(0))) {
switch (CC) {
default: break;
case ISD::SETOGE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGT:
// Converting this to a min would handle NaNs incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGE:
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMIN;
break;
case ISD::SETULT:
// Converting this to a max would handle NaNs incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETOLE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) &&
!DAG.isKnownNeverZeroFloat(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETULE:
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMAX;
break;
}
}
if (Opcode)
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
// Some mask scalar intrinsics rely on checking if only one bit is set
// and implement it in C code like this:
// A[0] = (U & 1) ? A[0] : W[0];
// This creates some redundant instructions that break pattern matching.
// fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1), Z, Y)
if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue AndNode = Cond.getOperand(0);
if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
isNullConstant(Cond.getOperand(1)) &&
isOneConstant(AndNode.getOperand(1))) {
// LHS and RHS swapped due to
// setcc outputting 1 when AND resulted in 0 and vice versa.
AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
}
}
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
// The same situation applies to all vectors of i8 and i16 without BWI.
// Make sure we extend these even before type legalization gets a chance to
// split wide vectors.
// Since SKX, these selects have a proper lowering.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
// AVX512 - Extend select with zero to merge with target shuffle.
// select(mask, extract_subvector(shuffle(x)), zero) -->
// extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
// TODO - support non target shuffles as well.
if (Subtarget.hasAVX512() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1) {
auto SelectableOp = [&TLI](SDValue Op) {
return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isTargetShuffle(Op.getOperand(0).getOpcode()) &&
isNullConstant(Op.getOperand(1)) &&
TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
Op.hasOneUse() && Op.getOperand(0).hasOneUse();
};
bool SelectableLHS = SelectableOp(LHS);
bool SelectableRHS = SelectableOp(RHS);
bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
: RHS.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
VT.getSizeInBits());
RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
VT.getSizeInBits());
Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
DAG.getUNDEF(SrcCondVT), Cond,
DAG.getIntPtrConstant(0, DL));
SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
}
}
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
// Canonicalize max and min:
// (x > y) ? x : y -> (x >= y) ? x : y
// (x < y) ? x : y -> (x <= y) ? x : y
// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
// the need for an extra compare
// against zero. e.g.
// (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
// subl %esi, %edi
// testl %edi, %edi
// movl $0, %eax
// cmovgl %edi, %eax
// =>
// xorl %eax, %eax
// subl %esi, %edi
// cmovsl %eax, %edi
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
Cond.hasOneUse() &&
DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
switch (CC) {
default: break;
case ISD::SETLT:
case ISD::SETGT: {
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
}
// Match VSELECTs into subs with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 for i8 and i16 vectors.
Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Check if one of the arms of the VSELECT is a zero vector. If it's on the
// left side, invert the predicate to simplify the logic below.
SDValue Other;
if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
Other = LHS;
}
if (Other.getNode() && Other->getNumOperands() == 2 &&
Other->getOperand(0) == Cond.getOperand(0)) {
SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
SDValue CondRHS = Cond->getOperand(1);
// Look for a general sub with unsigned saturation first.
// x >= y ? x-y : 0 --> subus x, y
// x > y ? x-y : 0 --> subus x, y
if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
if (isa<BuildVectorSDNode>(CondRHS)) {
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > C-1 ? x+-C : 0 --> subus x, C
auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
return (!Op && !Cond) ||
(Op && Cond &&
Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
};
if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
/*AllowUndefs*/ true)) {
OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
OpRHS);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
// FIXME: Would it be better to use computeKnownBits to determine
// whether it's safe to decanonicalize the xor?
// x s< 0 ? x^C : 0 --> subus x, C
if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
OpRHSConst->getAPIntValue().isSignMask()) {
// Note that we have to rebuild the RHS constant here to ensure we
// don't rely on particular values of undef lanes.
OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
}
}
}
}
}
// Match VSELECTs into add with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// paddus is available in SSE2 for i8 and i16 vectors.
Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue CondLHS = Cond->getOperand(0);
SDValue CondRHS = Cond->getOperand(1);
// Check if one of the arms of the VSELECT is a vector with all bits set.
// If it's on the left side, invert the predicate to simplify the logic below.
SDValue Other;
if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
Other = LHS;
}
if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
// Canonicalize condition operands.
if (CC == ISD::SETUGE) {
std::swap(CondLHS, CondRHS);
CC = ISD::SETULE;
}
// We can test against either of the addition operands.
// x <= x+y ? x+y : ~0 --> addus x, y
// x+y >= x ? x+y : ~0 --> addus x, y
if (CC == ISD::SETULE && Other == CondRHS &&
(OpLHS == CondLHS || OpRHS == CondLHS))
return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
CondLHS == OpLHS) {
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > ~C ? x+C : ~0 --> addus x, C
auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
return Cond->getAPIntValue() == ~Op->getAPIntValue();
};
if (CC == ISD::SETULE &&
ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
}
}
}
// Early exit check
if (!TLI.isTypeLegal(VT))
return SDValue();
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
return V;
// select(~Cond, X, Y) -> select(Cond, Y, X)
if (CondVT.getScalarType() != MVT::i1)
if (SDValue CondNot = IsNOT(Cond, DAG))
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
LHS = DAG.getBitcast(MVT::i64, LHS);
RHS = DAG.getBitcast(MVT::i64, RHS);
SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
return DAG.getBitcast(VT, newSelect);
}
return SDValue();
}
/// Combine:
/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
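/// For example (illustrative): (cmp (atomic_load_add x, -C), C) can instead
/// emit the RMW as a locked sub of C and use its EFLAGS directly, as done in
/// the Comparison == -Addend case below.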
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
// Can't replace the cmp if it has more uses than the one we're looking at.
// FIXME: We would like to be able to handle this, but would need to make sure
// all uses were updated.
if (!Cmp.hasOneUse())
return SDValue();
// This only applies to variations of the common case:
// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
// (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
// With the proper condcodes (see below), overflow is checked for.
// FIXME: We can generalize both constraints:
// - XOR/OR/AND (if they were made to survive AtomicExpand)
// - LHS != 1
// if the result is compared.
SDValue CmpLHS = Cmp.getOperand(0);
SDValue CmpRHS = Cmp.getOperand(1);
if (!CmpLHS.hasOneUse())
return SDValue();
unsigned Opc = CmpLHS.getOpcode();
if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
return SDValue();
SDValue OpRHS = CmpLHS.getOperand(2);
auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
if (!OpRHSC)
return SDValue();
APInt Addend = OpRHSC->getAPIntValue();
if (Opc == ISD::ATOMIC_LOAD_SUB)
Addend = -Addend;
auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
if (!CmpRHSC)
return SDValue();
APInt Comparison = CmpRHSC->getAPIntValue();
// If the addend is the negation of the comparison value, then we can do
// a full comparison by emitting the atomic arithmetic as a locked sub.
if (Comparison == -Addend) {
// The CC is fine, but we need to rewrite the LHS of the comparison as an
// atomic sub.
auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
auto AtomicSub = DAG.getAtomic(
ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
/*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
/*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
AN->getMemOperand());
auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
DAG.getUNDEF(CmpLHS.getValueType()));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
}
// We can handle comparisons with zero in a number of cases by manipulating
// the CC used.
if (!Comparison.isNullValue())
return SDValue();
if (CC == X86::COND_S && Addend == 1)
CC = X86::COND_LE;
else if (CC == X86::COND_NS && Addend == 1)
CC = X86::COND_G;
else if (CC == X86::COND_G && Addend == -1)
CC = X86::COND_GE;
else if (CC == X86::COND_LE && Addend == -1)
CC = X86::COND_L;
else
return SDValue();
SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
DAG.getUNDEF(CmpLHS.getValueType()));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
}
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
// condition code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
// Quit if not used as a boolean value.
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
// Check CMP operands. One of them should be 0 or 1 and the other should be
// a SetCC or extended from it.
SDValue Op1 = Cmp.getOperand(0);
SDValue Op2 = Cmp.getOperand(1);
SDValue SetCC;
const ConstantSDNode* C = nullptr;
bool needOppositeCond = (CC == X86::COND_E);
bool checkAgainstTrue = false; // Is it a comparison against 1?
if ((C = dyn_cast<ConstantSDNode>(Op1)))
SetCC = Op2;
else if ((C = dyn_cast<ConstantSDNode>(Op2)))
SetCC = Op1;
else // Quit if neither operand is a constant.
return SDValue();
if (C->getZExtValue() == 1) {
needOppositeCond = !needOppositeCond;
checkAgainstTrue = true;
} else if (C->getZExtValue() != 0)
// Quit if the constant is neither 0 nor 1.
return SDValue();
bool truncatedToBoolWithAnd = false;
// Skip (zext $x), (trunc $x), or (and $x, 1) node.
while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
SetCC.getOpcode() == ISD::TRUNCATE ||
SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
int OpIdx = -1;
if (isOneConstant(SetCC.getOperand(0)))
OpIdx = 1;
if (isOneConstant(SetCC.getOperand(1)))
OpIdx = 0;
if (OpIdx < 0)
break;
SetCC = SetCC.getOperand(OpIdx);
truncatedToBoolWithAnd = true;
} else
SetCC = SetCC.getOperand(0);
}
switch (SetCC.getOpcode()) {
case X86ISD::SETCC_CARRY:
// Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
// simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
// i.e. it's a comparison against true but the result of SETCC_CARRY is not
// truncated to i1 using 'and'.
if (checkAgainstTrue && !truncatedToBoolWithAnd)
break;
assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
"Invalid use of SETCC_CARRY!");
LLVM_FALLTHROUGH;
case X86ISD::SETCC:
// Set the condition code or opposite one if necessary.
CC = X86::CondCode(SetCC.getConstantOperandVal(0));
if (needOppositeCond)
CC = X86::GetOppositeBranchCondition(CC);
return SetCC.getOperand(1);
case X86ISD::CMOV: {
// Check whether false/true value has canonical one, i.e. 0 or 1.
ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
// Quit if true value is not a constant.
if (!TVal)
return SDValue();
// Quit if false value is not a constant.
if (!FVal) {
SDValue Op = SetCC.getOperand(0);
// Skip 'zext' or 'trunc' node.
if (Op.getOpcode() == ISD::ZERO_EXTEND ||
Op.getOpcode() == ISD::TRUNCATE)
Op = Op.getOperand(0);
// A special case for rdrand/rdseed, where 0 is set if false cond is
// found.
if ((Op.getOpcode() != X86ISD::RDRAND &&
Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
return SDValue();
}
// Quit if false value is not the constant 0 or 1.
bool FValIsFalse = true;
if (FVal && FVal->getZExtValue() != 0) {
if (FVal->getZExtValue() != 1)
return SDValue();
// If FVal is 1, opposite cond is needed.
needOppositeCond = !needOppositeCond;
FValIsFalse = false;
}
// Quit if TVal is not the constant opposite of FVal.
if (FValIsFalse && TVal->getZExtValue() != 1)
return SDValue();
if (!FValIsFalse && TVal->getZExtValue() != 0)
return SDValue();
CC = X86::CondCode(SetCC.getConstantOperandVal(2));
if (needOppositeCond)
CC = X86::GetOppositeBranchCondition(CC);
return SetCC.getOperand(3);
}
}
return SDValue();
}
/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
/// (X86or (X86setcc) (X86setcc))
/// (X86cmp (and (X86setcc) (X86setcc)), 0)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
X86::CondCode &CC1, SDValue &Flags,
bool &isAnd) {
if (Cond->getOpcode() == X86ISD::CMP) {
if (!isNullConstant(Cond->getOperand(1)))
return false;
Cond = Cond->getOperand(0);
}
isAnd = false;
SDValue SetCC0, SetCC1;
switch (Cond->getOpcode()) {
default: return false;
case ISD::AND:
case X86ISD::AND:
isAnd = true;
LLVM_FALLTHROUGH;
case ISD::OR:
case X86ISD::OR:
SetCC0 = Cond->getOperand(0);
SetCC1 = Cond->getOperand(1);
break;
};
// Make sure we have SETCC nodes, using the same flags value.
if (SetCC0.getOpcode() != X86ISD::SETCC ||
SetCC1.getOpcode() != X86ISD::SETCC ||
SetCC0->getOperand(1) != SetCC1->getOperand(1))
return false;
CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
Flags = SetCC0->getOperand(1);
return true;
}
// When legalizing carry, we create carries via add X, -1
// If that comes from an actual carry, via setcc, we use the
// carry directly.
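// For example (illustrative): X86ISD::ADD (setcc COND_B flags), -1 sets CF
// exactly when the setcc produced 1, so the original 'flags' carry can be
// used directly instead of rematerializing it through the add.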
static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
if (EFLAGS.getOpcode() == X86ISD::ADD) {
if (isAllOnesConstant(EFLAGS.getOperand(1))) {
SDValue Carry = EFLAGS.getOperand(0);
while (Carry.getOpcode() == ISD::TRUNCATE ||
Carry.getOpcode() == ISD::ZERO_EXTEND ||
Carry.getOpcode() == ISD::SIGN_EXTEND ||
Carry.getOpcode() == ISD::ANY_EXTEND ||
(Carry.getOpcode() == ISD::AND &&
isOneConstant(Carry.getOperand(1))))
Carry = Carry.getOperand(0);
if (Carry.getOpcode() == X86ISD::SETCC ||
Carry.getOpcode() == X86ISD::SETCC_CARRY) {
// TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
uint64_t CarryCC = Carry.getConstantOperandVal(0);
SDValue CarryOp1 = Carry.getOperand(1);
if (CarryCC == X86::COND_B)
return CarryOp1;
if (CarryCC == X86::COND_A) {
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
// Do not flip "e > c", where "c" is a constant, because the Cmp
// instruction cannot take an immediate as its first operand.
//
if (CarryOp1.getOpcode() == X86ISD::SUB &&
CarryOp1.getNode()->hasOneUse() &&
CarryOp1.getValueType().isInteger() &&
!isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
SDValue SubCommute =
DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
CarryOp1.getOperand(1), CarryOp1.getOperand(0));
return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
}
}
// If this is a check of the z flag of an add with 1, switch to the
// C flag.
if (CarryCC == X86::COND_E &&
CarryOp1.getOpcode() == X86ISD::ADD &&
isOneConstant(CarryOp1.getOperand(1)))
return CarryOp1;
}
}
}
return SDValue();
}
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (CC == X86::COND_B)
if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
return Flags;
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
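/// For example (illustrative): a cmov of the constants 8 and 0 becomes
/// zext(setcc) << 3 in the constant-arm handling below.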
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue FalseOp = N->getOperand(0);
SDValue TrueOp = N->getOperand(1);
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
// cmov X, X, ?, ? --> X
if (TrueOp == FalseOp)
return TrueOp;
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
// larger than FalseC (the false value).
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueC, FalseC);
std::swap(TrueOp, FalseOp);
}
// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
// This is efficient for any integer data type (including i8/i16) and
// shift amount.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
DAG.getConstant(ShAmt, DL, MVT::i8));
return Cond;
}
// Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
// for any integer data type, including i8/i16.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
FalseC->getValueType(0), Cond);
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
"Implicit constant truncation");
bool isFastMultiplier = false;
if (Diff.ult(10)) {
switch (Diff.getZExtValue()) {
default: break;
case 1: // result = add base, cond
case 2: // result = lea base( , cond*2)
case 3: // result = lea base(cond, cond*2)
case 4: // result = lea base( , cond*4)
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
isFastMultiplier = true;
break;
}
}
if (isFastMultiplier) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
Cond);
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
DAG.getConstant(Diff, DL, Cond.getValueType()));
// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
}
}
}
// Handle these cases:
// (select (x != c), e, c) -> select (x != c), e, x),
// (select (x == c), c, e) -> select (x == c), x, e)
// where the c is an integer constant, and the "select" is the combination
// of CMOV and CMP.
//
// The rationale for this change is that the conditional-move from a constant
// needs two instructions, however, conditional-move from a register needs
// only one instruction.
//
// CAVEAT: By replacing a constant with a symbolic value, it may obscure
// some instruction-combining opportunities. This opt needs to be
// postponed as late as possible.
//
if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
// The DCI.xxxx conditions are provided to postpone the optimization as
// late as possible.
ConstantSDNode *CmpAgainst = nullptr;
if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
!isa<ConstantSDNode>(Cond.getOperand(0))) {
if (CC == X86::COND_NE &&
CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueOp, FalseOp);
}
if (CC == X86::COND_E &&
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = {FalseOp, Cond.getOperand(0),
DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
}
// Fold and/or of setcc's to double CMOV:
// (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
// (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
//
// This combine lets us generate:
// cmovcc1 (jcc1 if we don't have CMOV)
// cmovcc2 (same)
// instead of:
// setcc1
// setcc2
// and/or
// cmovne (jne if we don't have CMOV)
// When we can't use the CMOV instruction, it might increase branch
// mispredicts.
// When we can use CMOV, or when there is no mispredict, this improves
// throughput and reduces register pressure.
//
if (CC == X86::COND_NE) {
SDValue Flags;
X86::CondCode CC0, CC1;
bool isAndSetCC;
if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
if (isAndSetCC) {
std::swap(FalseOp, TrueOp);
CC0 = X86::GetOppositeBranchCondition(CC0);
CC1 = X86::GetOppositeBranchCondition(CC1);
}
SDValue LOps[] = {FalseOp, TrueOp,
DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
Flags};
SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
return CMOV;
}
}
// Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
// (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
// Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
// (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
if ((CC == X86::COND_NE || CC == X86::COND_E) &&
Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
SDValue Add = TrueOp;
SDValue Const = FalseOp;
// Canonicalize the condition code for easier matching and output.
if (CC == X86::COND_E)
std::swap(Add, Const);
// We might have replaced the constant in the cmov with the LHS of the
// compare. If so change it to the RHS of the compare.
if (Const == Cond.getOperand(0))
Const = Cond.getOperand(1);
// Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
(Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
EVT VT = N->getValueType(0);
// This should constant fold.
SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
SDValue CMov =
DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
}
}
return SDValue();
}
/// Different mul shrinking modes.
enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
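/// Decide whether a vXi32 multiply can be narrowed, based on known sign bits.
/// E.g. (illustrative) 25 sign bits on an i32 leave at most 8 significant
/// bits, so both operands fit in i8 and MULS8 applies; 17 sign bits likewise
/// select MULS16.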
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
EVT VT = N->getOperand(0).getValueType();
if (VT.getScalarSizeInBits() != 32)
return false;
assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
unsigned SignBits[2] = {1, 1};
bool IsPositive[2] = {false, false};
for (unsigned i = 0; i < 2; i++) {
SDValue Opd = N->getOperand(i);
SignBits[i] = DAG.ComputeNumSignBits(Opd);
IsPositive[i] = DAG.SignBitIsZero(Opd);
}
bool AllPositive = IsPositive[0] && IsPositive[1];
unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
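// ComputeNumSignBits counts how many high bits are copies of the sign bit:
// a 32-bit element sign-extended from i8 has 25 of them (24 extension bits
// plus the original sign bit), and one zero-extended from i8 has at least
// 24. The thresholds below follow directly from that.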
// When ranges are from -128 ~ 127, use MULS8 mode.
if (MinSignBits >= 25)
Mode = ShrinkMode::MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
Mode = ShrinkMode::MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
Mode = ShrinkMode::MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
Mode = ShrinkMode::MULU16;
else
return false;
return true;
}
/// When the operands of a vector mul are extended from smaller-sized values,
/// like i8 and i16, the mul may be shrunk to a narrower type to generate more
/// efficient code. Two typical patterns are handled:
/// Pattern1:
/// %2 = sext/zext <N x i8> %1 to <N x i32>
/// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// Pattern2:
/// %2 = zext/sext <N x i16> %1 to <N x i32>
/// %4 = zext/sext <N x i16> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// There are four mul shrinking modes:
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
/// generate pmullw+sext32 for it (MULS8 mode).
/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
/// generate pmullw+zext32 for it (MULU8 mode).
/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
/// generate pmullw+pmulhw for it (MULS16 mode).
/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
/// generate pmullw+pmulhuw for it (MULU16 mode).
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Check for legality.
// pmullw/pmulhw require SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
// Check for profitability.
// pmulld is available since SSE4.1. It is usually better to use pmulld
// instead of pmullw+pmulhw, except on subtargets where pmulld is slower than
// the expansion.
bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
ShrinkMode Mode;
if (!canReduceVMulWidth(N, DAG, Mode))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getOperand(0).getValueType();
unsigned NumElts = VT.getVectorNumElements();
if ((NumElts % 2) != 0)
return SDValue();
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
// lower part is needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
: ISD::SIGN_EXTEND,
DL, VT, MulLo);
MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
SDValue MulHi =
DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
// Generate shuffle functioning as punpcklwd.
SmallVector<int, 16> ShuffleMask(NumElts);
for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i;
ShuffleMask[2 * i + 1] = i + NumElts;
}
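// With NumElts == 8, for example, this builds the interleave mask
// {0, 8, 1, 9, 2, 10, 3, 11}, pairing low-half elements of MulLo with the
// corresponding elements of MulHi, exactly as punpcklwd does.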
SDValue ResLo =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResLo = DAG.getBitcast(ResVT, ResLo);
// Generate shuffle functioning as punpckhwd.
for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i + NumElts / 2;
ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
}
SDValue ResHi =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResHi = DAG.getBitcast(ResVT, ResHi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
}
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
EVT VT, const SDLoc &DL) {
auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(Mult, DL, VT));
Result = DAG.getNode(ISD::SHL, DL, VT, Result,
DAG.getConstant(Shift, DL, MVT::i8));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
N->getOperand(0));
return Result;
};
auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(Mul1, DL, VT));
Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
DAG.getConstant(Mul2, DL, VT));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
N->getOperand(0));
return Result;
};
switch (MulAmt) {
default:
break;
case 11:
// mul x, 11 => add ((shl (mul x, 5), 1), x)
return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
case 21:
// mul x, 21 => add ((shl (mul x, 5), 2), x)
return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
case 41:
// mul x, 41 => add ((shl (mul x, 5), 3), x)
return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
case 22:
// mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
case 19:
// mul x, 19 => add ((shl (mul x, 9), 1), x)
return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
case 37:
// mul x, 37 => add ((shl (mul x, 9), 2), x)
return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
case 73:
// mul x, 73 => add ((shl (mul x, 9), 3), x)
return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
case 13:
// mul x, 13 => add ((shl (mul x, 3), 2), x)
return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
case 23:
// mul x, 23 => sub ((shl (mul x, 3), 3), x)
return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
case 26:
// mul x, 26 => add ((mul (mul x, 5), 5), x)
return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
case 28:
// mul x, 28 => add ((mul (mul x, 9), 3), x)
return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
case 29:
// mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
}
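// Sanity-checking the decompositions above: 11 = 5*2 + 1, 22 = (5*4 + 1) + 1,
// 23 = 3*8 - 1, 26 = 5*5 + 1, 28 = 9*3 + 1, and 29 = (9*3 + 1) + 1.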
// Another trick. If this is a power of 2 plus 2, 4, or 8, we can use a shift
// followed by a single LEA.
// First check if this is a sum of two powers of 2 because that's easy. Then
// count the trailing zeros to find the lowest set bit.
// TODO: We can do this even without LEA at a cost of two shifts and an add.
if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
unsigned ScaleShift = countTrailingZeros(MulAmt);
if (ScaleShift >= 1 && ScaleShift < 4) {
unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(ShiftAmt, DL, MVT::i8));
SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(ScaleShift, DL, MVT::i8));
return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
}
}
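// For example, MulAmt == 34 (0b100010) qualifies: clearing its lowest set
// bit leaves 32 (a power of 2) and ScaleShift == 1, so we emit
// (x << 5) + (x << 1) == 34*x, where the shift by 1 can become the *2
// scale of an LEA.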
return SDValue();
}
// If the upper 17 bits of each element are zero then we can use PMADDWD,
// which is always at least as quick as PMULLD, except on KNL.
static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
if (Subtarget.isPMADDWDSlow())
return SDValue();
EVT VT = N->getValueType(0);
// Only support vXi32 vectors.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();
// Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
// Also allow v2i32 if it will be widened.
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// If we are zero-extending in two steps without SSE4.1, it's better to reduce
// the vmul width instead.
if (!Subtarget.hasSSE41() &&
(N0.getOpcode() == ISD::ZERO_EXTEND &&
N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
(N1.getOpcode() == ISD::ZERO_EXTEND &&
N1.getOperand(0).getScalarValueSizeInBits() <= 8))
return SDValue();
APInt Mask17 = APInt::getHighBitsSet(32, 17);
if (!DAG.MaskedValueIsZero(N1, Mask17) ||
!DAG.MaskedValueIsZero(N0, Mask17))
return SDValue();
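// At this point both operands fit in the low 15 bits of each i32 element,
// so truncating to i16 keeps them non-negative. PMADDWD's signed
// i16 x i16 -> i32 multiply-add then produces the exact product because
// every odd i16 lane of the bitcast inputs is zero.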
// Use SplitOpsAndApply to handle AVX splitting.
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
{ DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
PMADDWDBuilder);
}
static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
EVT VT = N->getValueType(0);
// Only support vXi64 vectors.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
VT.getVectorNumElements() < 2 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// PMULDQ returns the 64-bit result of the signed multiplication of the lower
// 32 bits of each operand. We can lower with this if the sign bits stretch
// that far.
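// If an operand has more than 32 sign bits, its upper 33 bits are all
// copies of bit 31, so the full i64 value is exactly the sign-extension of
// its low 32 bits and PMULDQ computes the precise 64-bit product.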
if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
DAG.ComputeNumSignBits(N1) > 32) {
auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
PMULDQBuilder, /*CheckBWI*/false);
}
// If the upper bits are zero we can use a single pmuludq.
APInt Mask = APInt::getHighBitsSet(64, 32);
if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
PMULUDQBuilder, /*CheckBWI*/false);
}
return SDValue();
}
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
return V;
if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
return V;
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
return SDValue();
if (isPowerOf2_64(C->getZExtValue()))
return SDValue();
int64_t SignMulAmt = C->getSExtValue();
assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
SDLoc DL(N);
if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(AbsMulAmt, DL, VT));
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
NewMul);
return NewMul;
}
uint64_t MulAmt1 = 0;
uint64_t MulAmt2 = 0;
if ((AbsMulAmt % 9) == 0) {
MulAmt1 = 9;
MulAmt2 = AbsMulAmt / 9;
} else if ((AbsMulAmt % 5) == 0) {
MulAmt1 = 5;
MulAmt2 = AbsMulAmt / 5;
} else if ((AbsMulAmt % 3) == 0) {
MulAmt1 = 3;
MulAmt2 = AbsMulAmt / 3;
}
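// For example, AbsMulAmt == 45 factors as 9 * 5 and becomes two LEAs:
// t = x + 8*x, then t + 4*t.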
SDValue NewMul;
// For negative multiply amounts, only allow MulAmt2 to be a power of 2.
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) ||
(SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
if (isPowerOf2_64(MulAmt2) &&
!(SignMulAmt >= 0 && N->hasOneUse() &&
N->use_begin()->getOpcode() == ISD::ADD))
// If the second multiplier is a pow2, issue it first. We want the multiply by
// 3, 5, or 9 to be folded into the addressing mode unless the lone use
// is an add. Only do this for positive multiply amounts since the
// negate would prevent it from being used as an address mode anyway.
std::swap(MulAmt1, MulAmt2);
if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(MulAmt1, DL, VT));
if (isPowerOf2_64(MulAmt2))
NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, DL, VT));
// Negate the result.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
NewMul);
} else if (!Subtarget.slowLEA())
NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
if (!NewMul) {
assert(C->getZExtValue() != 0 &&
C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
"Both cases that could cause potential overflows should have "
"already been handled.");
if (isPowerOf2_64(AbsMulAmt - 1)) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
NewMul = DAG.getNode(
ISD::ADD, DL, VT, N->getOperand(0),
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
MVT::i8)));
// To negate, subtract the number from zero
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT), NewMul);
} else if (isPowerOf2_64(AbsMulAmt + 1)) {
// (mul x, 2^N - 1) => (sub (shl x, N), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt + 1),
DL, MVT::i8));
// To negate, reverse the operands of the subtract.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
else
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
// (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt - 2),
DL, MVT::i8));
NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
// (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt + 2),
DL, MVT::i8));
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
}
}
return NewMul;
}
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
EVT VT = N0.getValueType();
// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zero's or all ones.
if (VT.isInteger() && !VT.isVector() &&
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
APInt Mask = N0.getConstantOperandAPInt(1);
Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
// we carefully interrogate the mask to make sure the transform is
// semantics-preserving.
// The transform is not safe if the result of C1 << C2 exceeds the bitwidth
// of the underlying setcc_c operation if the setcc_c was zero extended.
// Consider the following example:
// zext(setcc_c) -> i32 0x0000FFFF
// c1 -> i32 0x0000FFFF
// c2 -> i32 0x00000001
// (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
// (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
} else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
} else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
N00.getOpcode() == ISD::ANY_EXTEND) &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
}
if (MaskOK && Mask != 0) {
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
}
}
// Hardware support for vector shifts is sparse, which makes us scalarize the
// vector operations in many cases. Also, on Sandy Bridge ADD is faster than
// SHL.
// (shl V, 1) -> add V,V
if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
assert(N0.getValueType().isVector() && "Invalid vector shift type");
// We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
if (N1SplatC->isOne())
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
return SDValue();
}
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Size = VT.getSizeInBits();
// fold (ashr (shl a, [56,48,32,24,16]), SarConst)
// into (shl (sext_inreg a), [56,48,32,24,16] - SarConst) or
// into (sra (sext_inreg a), SarConst - [56,48,32,24,16]),
// depending on the sign of (SarConst - [56,48,32,24,16]).
// sexts on x86 are MOVs (movsx). The MOVs have the same code size as the
// SHIFTs above (only a shift by 1 has a smaller encoding).
// However, the MOVs have two advantages over a SHIFT:
// 1. a MOV can write to a register that differs from its source.
// 2. a MOV accepts memory operands.
if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
N0.getOperand(1).getOpcode() != ISD::Constant)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
EVT CVT = N1.getValueType();
if (SarConst.isNegative())
return SDValue();
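// For example, with i32: (ashr (shl x, 24), 25) matches SVT == i8 below
// (ShlConst == 24 == 32 - 8); SarConst becomes 25 - 24 == 1, so we emit
// (sra (sext_inreg x, i8), 1), i.e. a movsx plus a cheap scalar shift.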
for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
unsigned ShiftSize = SVT.getSizeInBits();
// Skip types without a corresponding sext/zext and ShlConst values that
// are not one of [56,48,32,24,16].
if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
continue;
SDLoc DL(N);
SDValue NN =
DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
SarConst = SarConst - (Size - ShiftSize);
if (SarConst == 0)
return NN;
else if (SarConst.isNegative())
return DAG.getNode(ISD::SHL, DL, VT, NN,
DAG.getConstant(-SarConst, DL, CVT));
else
return DAG.getNode(ISD::SRA, DL, VT, NN,
DAG.getConstant(SarConst, DL, CVT));
}
return SDValue();
}
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
// Only do this on the last DAG combine as it can interfere with other
// combines.
if (!DCI.isAfterLegalizeDAG())
return SDValue();
// Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
// TODO: This is a generic DAG combine that became an x86-only combine to
// avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
// and-not ('andn').
if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
return SDValue();
auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!ShiftC || !AndC)
return SDValue();
// If we can shrink the constant mask below 8-bits or 32-bits, then this
// transform should reduce code size. It may also enable secondary transforms
// from improved known-bits analysis or instruction selection.
APInt MaskVal = AndC->getAPIntValue();
// If this can be matched by a zero extend, don't optimize.
if (MaskVal.isMask()) {
unsigned TO = MaskVal.countTrailingOnes();
if (TO >= 8 && isPowerOf2_32(TO))
return SDValue();
}
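// For example, srl (and X, 0x7F0), 4 becomes and (srl X, 4), 0x7F: the
// 12-bit mask shrinks to 8 bits, so the AND immediate fits in a
// sign-extended byte.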
APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
unsigned OldMaskSize = MaskVal.getMinSignedBits();
unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
(OldMaskSize > 32 && NewMaskSize <= 32)) {
// srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
SDLoc DL(N);
SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
}
return SDValue();
}
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
"Unexpected shift opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned DstBitsPerElt = VT.getScalarSizeInBits();
unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
"Unexpected PACKSS/PACKUS input type");
bool IsSigned = (X86ISD::PACKSS == Opcode);
// Constant Folding.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
(N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumDstElts = VT.getVectorNumElements();
unsigned NumSrcElts = NumDstElts / 2;
unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
APInt Undefs(NumDstElts, 0);
SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
if (UndefElts[SrcIdx]) {
Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
continue;
}
APInt &Val = EltBits[SrcIdx];
if (IsSigned) {
// PACKSS: Truncate signed value with signed saturation.
// Source values less than dst minint are saturated to minint.
// Source values greater than dst maxint are saturated to maxint.
if (Val.isSignedIntN(DstBitsPerElt))
Val = Val.trunc(DstBitsPerElt);
else if (Val.isNegative())
Val = APInt::getSignedMinValue(DstBitsPerElt);
else
Val = APInt::getSignedMaxValue(DstBitsPerElt);
} else {
// PACKUS: Truncate signed value with unsigned saturation.
// Source values less than zero are saturated to zero.
// Source values greater than dst maxuint are saturated to maxuint.
if (Val.isIntN(DstBitsPerElt))
Val = Val.trunc(DstBitsPerElt);
else if (Val.isNegative())
Val = APInt::getNullValue(DstBitsPerElt);
else
Val = APInt::getAllOnesValue(DstBitsPerElt);
}
Bits[Lane * NumDstEltsPerLane + Elt] = Val;
}
}
return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
}
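// For example, when packing i32 -> i16: PACKSSDW saturates 40000 to 32767,
// while PACKUSDW clamps -1 to 0 and 70000 to 65535.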
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
// truncate to create a larger truncate.
if (Subtarget.hasAVX512() &&
N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
N0.getOperand(0).getValueType() == MVT::v8i32) {
if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
(!IsSigned &&
DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
if (Subtarget.hasVLX())
return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
// Widen input to v16i32 so we can truncate that.
SDLoc dl(N);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
}
}
// Attempt to combine as shuffle.
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
}
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
X86ISD::VSRL == N->getOpcode()) &&
"Unexpected shift opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Shift zero -> zero.
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Detect constant shift amounts.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
EltBits[0].getZExtValue(), DAG);
}
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
return SDValue();
}
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
X86ISD::VSRLI == Opcode) &&
"Unexpected shift opcode");
bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
"Unexpected value type");
assert(N->getOperand(1).getValueType() == MVT::i8 &&
"Unexpected shift amount type");
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
unsigned ShiftVal = N->getConstantOperandVal(1);
if (ShiftVal >= NumBitsPerElt) {
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
else
ShiftVal = NumBitsPerElt - 1;
}
// Shift N0 by zero -> N0.
if (!ShiftVal)
return N0;
// Shift zero -> zero.
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
// clamped to (NumBitsPerElt - 1).
if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
unsigned NewShiftVal = ShiftVal + ShiftVal2;
if (NewShiftVal >= NumBitsPerElt)
NewShiftVal = NumBitsPerElt - 1;
return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
}
// We can decode 'whole byte' logical bit shifts as shuffles.
if (LogicalShift && (ShiftVal % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
// Constant Folding.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
if (N->isOnlyUserOf(N0.getNode()) &&
getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
assert(EltBits.size() == VT.getVectorNumElements() &&
"Unexpected shift value type");
// Undef elements need to fold to 0. It's possible SimplifyDemandedBits
// created an undef input due to no input bits being demanded, but the
// user still expects 0 in the other bits.
for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
APInt &Elt = EltBits[i];
if (UndefElts[i])
Elt = 0;
else if (X86ISD::VSHLI == Opcode)
Elt <<= ShiftVal;
else if (X86ISD::VSRAI == Opcode)
Elt.ashrInPlace(ShiftVal);
else
Elt.lshrInPlace(ShiftVal);
}
// Reset undef elements since they were zeroed above.
UndefElts = 0;
return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
}
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0),
APInt::getAllOnesValue(NumBitsPerElt), DCI))
return SDValue(N, 0);
return SDValue();
}
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
(N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
"Unexpected vector insertion");
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0),
APInt::getAllOnesValue(NumBitsPerElt), DCI))
return SDValue(N, 0);
// Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
}
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned opcode;
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0.getOperand(1);
SDValue CMP1 = N1.getOperand(1);
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
return SDValue();
SDValue CMP00 = CMP0->getOperand(0);
SDValue CMP01 = CMP0->getOperand(1);
EVT VT = CMP00.getValueType();
if (VT == MVT::f32 || VT == MVT::f64) {
bool ExpectingFlags = false;
// Check for any users that want flags:
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
!ExpectingFlags && UI != UE; ++UI)
switch (UI->getOpcode()) {
default:
case ISD::BR_CC:
case ISD::BRCOND:
case ISD::SELECT:
ExpectingFlags = true;
break;
case ISD::CopyToReg:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
break;
}
if (!ExpectingFlags) {
enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
X86::CondCode tmp = cc0;
cc0 = cc1;
cc1 = tmp;
}
if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getTargetConstant(x86cc, DL, MVT::i8));
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
DAG.getConstant(0, DL, MVT::v16i1),
FSetCC, DAG.getIntPtrConstant(0, DL));
return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
N->getSimpleValueType(0));
}
SDValue OnesOrZeroesF =
DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
if (is64BitFP && !Subtarget.is64Bit()) {
// On a 32-bit target, we cannot bitcast the 64-bit float to a
// 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones or all zeroes, we don't need all the
// bits, but can do this little dance to extract the lowest 32 bits
// and work with those going forward.
SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
OnesOrZeroesF);
SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
Vector32, DAG.getIntPtrConstant(0, DL));
IntVT = MVT::i32;
}
SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
DAG.getConstant(1, DL, IntVT));
SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
ANDed);
return OneBitOfTruth;
}
}
}
}
return SDValue();
}
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
MVT VT = N->getSimpleValueType(0);
if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
SDValue X, Y;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (SDValue Not = IsNOT(N0, DAG)) {
X = Not;
Y = N1;
} else if (SDValue Not = IsNOT(N1, DAG)) {
X = Not;
Y = N0;
} else
return SDValue();
X = DAG.getBitcast(VT, X);
Y = DAG.getBitcast(VT, Y);
return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Expected vector type");
assert((N->getOpcode() == ISD::ANY_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND ||
N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
SDValue Narrow = N->getOperand(0);
EVT NarrowVT = Narrow.getValueType();
if (Narrow->getOpcode() != ISD::XOR &&
Narrow->getOpcode() != ISD::AND &&
Narrow->getOpcode() != ISD::OR)
return SDValue();
SDValue N0 = Narrow->getOperand(0);
SDValue N1 = Narrow->getOperand(1);
SDLoc DL(Narrow);
// The left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
return SDValue();
// The type of the truncated inputs.
if (N0.getOperand(0).getValueType() != VT)
return SDValue();
// The right side has to be a 'trunc' or a constant vector.
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getValueType() == VT;
if (!RHSTrunc &&
!ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
return SDValue();
// Set N0 and N1 to hold the inputs to the new wide operation.
N0 = N0.getOperand(0);
if (RHSTrunc)
N1 = N1.getOperand(0);
else
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
// Generate the wide operation.
SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
unsigned Opcode = N->getOpcode();
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::ANY_EXTEND:
return Op;
case ISD::ZERO_EXTEND:
return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
case ISD::SIGN_EXTEND:
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
Op, DAG.getValueType(NarrowVT));
}
}
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N10 = N1.getOperand(0);
EVT N00Type = N00.getValueType();
EVT N10Type = N10.getValueType();
// Ensure that both types are the same and are legal scalar fp types.
if (N00Type != N10Type ||
!((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
(Subtarget.hasSSE2() && N00Type == MVT::f64)))
return SDValue();
unsigned FPOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected input node for FP logic conversion");
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
}
SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
return DAG.getBitcast(VT, FPLogic);
}
/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
/// mask (Mask == 1 is the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
EVT VT0 = Op0.getValueType();
EVT VT1 = Op1.getValueType();
if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
return SDValue();
APInt SplatVal;
if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
!SplatVal.isMask())
return SDValue();
// Don't prevent creation of ANDN.
if (isBitwiseNot(Op0))
return SDValue();
if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
return SDValue();
unsigned EltBitWidth = VT0.getScalarSizeInBits();
if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();
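// Every element of Op0 is now known to be all-ones or all-zero, so masking
// its low bits equals a logical right shift by (EltBitWidth - ShiftVal);
// e.g. for v4i32 with mask 0xFF this emits a VSRLI by 24.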
SDLoc DL(N);
unsigned ShiftVal = SplatVal.countTrailingOnes();
SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
// Get the index node from the lowered DAG of a GEP IR instruction with one
// indexing dimension.
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
if (Ld->isIndexed())
return SDValue();
SDValue Base = Ld->getBasePtr();
if (Base.getOpcode() != ISD::ADD)
return SDValue();
SDValue ShiftedIndex = Base.getOperand(0);
if (ShiftedIndex.getOpcode() != ISD::SHL)
return SDValue();
return ShiftedIndex.getOperand(0);
}
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
switch (VT.getSizeInBits()) {
default: return false;
case 64: return Subtarget.is64Bit();
case 32: return true;
}
}
return false;
}
// This function recognizes cases where the X86 bzhi instruction can replace
// an 'and-load' sequence.
// When loading an integer value from an array of constants which is defined
// as follows:
//
// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// and then applying a bitwise AND between the loaded value and another input,
// the sequence is equivalent to performing bzhi (zero high bits) on that
// input, using the same index as the load.
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Node->getSimpleValueType(0);
SDLoc dl(Node);
// Check if subtarget has BZHI instruction for the node's type
if (!hasBZHI(Subtarget, VT))
return SDValue();
// Try matching the pattern for both operands.
for (unsigned i = 0; i < 2; i++) {
SDValue N = Node->getOperand(i);
LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
// Bail out if the operand is not a load instruction.
if (!Ld)
return SDValue();
const Value *MemOp = Ld->getMemOperand()->getValue();
if (!MemOp)
return SDValue();
if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
Constant *Init = GV->getInitializer();
Type *Ty = Init->getType();
if (!isa<ConstantDataArray>(Init) ||
!Ty->getArrayElementType()->isIntegerTy() ||
Ty->getArrayElementType()->getScalarSizeInBits() !=
VT.getSizeInBits() ||
Ty->getArrayNumElements() >
Ty->getArrayElementType()->getScalarSizeInBits())
continue;
// Check if the array's constant elements are suitable to our case.
uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
bool ConstantsMatch = true;
for (uint64_t j = 0; j < ArrayElementCount; j++) {
ConstantInt *Elem =
dyn_cast<ConstantInt>(Init->getAggregateElement(j));
if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
ConstantsMatch = false;
break;
}
}
if (!ConstantsMatch)
continue;
// Do the transformation (for a 32-bit type):
// (and (load arr[idx]), inp)
// -> (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
// which will then be matched to a single bzhi instruction.
SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
// Get the Node which indexes into the array.
SDValue Index = getIndexFromUnindexedLoad(Ld);
if (!Index)
return SDValue();
Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
}
}
}
}
return SDValue();
}
// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
// Turn it into series of XORs and a setnp.
static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
// We only support 64-bit and 32-bit. 64-bit requires special handling
// unless the 64-bit popcnt instruction is legal.
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// LHS needs to be a single use CTPOP.
if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
return SDValue();
// RHS needs to be 1.
if (!isOneConstant(N1))
return SDValue();
SDLoc DL(N);
SDValue X = N0.getOperand(0);
// If this is 64-bit, it's always best to xor the two 32-bit pieces together
// even if we have popcnt.
if (VT == MVT::i64) {
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(32, DL, MVT::i8)));
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
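// Parity is the XOR of all bits, so folding the two 32-bit halves together
// with XOR preserves the overall parity while halving the width.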
// Generate a 32-bit parity idiom. This will bring us back here if we need
// to expand it too.
SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
DAG.getConstant(1, DL, MVT::i32));
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
}
assert(VT == MVT::i32 && "Unexpected VT!");
// Xor the high and low 16-bits together using a 32-bit operation.
SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(16, DL, MVT::i8));
X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
// Finally, xor the low 2 bytes together and use an 8-bit flag-setting xor.
// This should allow an h-reg to be used to save a shift.
// FIXME: We only get an h-reg in 32-bit mode.
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(8, DL, MVT::i8)));
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
// Copy the inverse of the parity flag into a register with setcc.
SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
// Zero extend to original type.
return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
}
// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef))), C)
// where C is a mask containing the same number of bits as the setcc and
// where the setcc will freely zero the upper bits of the k-register. We can
// replace the undef in the concat with 0s and remove the AND. This mainly
// helps with v2i1/v4i1 setcc being cast to scalar.
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
EVT VT = N->getValueType(0);
// Make sure this is an AND with constant. We will check the value of the
// constant later.
if (!isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();
// This is implied by the ConstantSDNode.
assert(!VT.isVector() && "Expected scalar VT!");
if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
!N->getOperand(0).hasOneUse() ||
!N->getOperand(0).getOperand(0).hasOneUse())
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Src = N->getOperand(0).getOperand(0);
EVT SrcVT = Src.getValueType();
if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
!TLI.isTypeLegal(SrcVT))
return SDValue();
if (Src.getOpcode() != ISD::CONCAT_VECTORS)
return SDValue();
// We only care about the first subvector of the concat, we expect the
// other subvectors to be ignored due to the AND if we make the change.
SDValue SubVec = Src.getOperand(0);
EVT SubVecVT = SubVec.getValueType();
// First subvector should be a setcc with a legal result type. The RHS of the
// AND should be a mask with this many bits.
if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
!N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
return SDValue();
EVT SetccVT = SubVec.getOperand(0).getValueType();
if (!TLI.isTypeLegal(SetccVT) ||
!(Subtarget.hasVLX() || SetccVT.is512BitVector()))
return SDValue();
if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
return SDValue();
// We passed all the checks. Rebuild the concat_vectors with zeroes
// and cast it back to VT.
SDLoc dl(N);
SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
DAG.getConstant(0, dl, SubVecVT));
Ops[0] = SubVec;
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
Ops);
return DAG.getBitcast(VT, Concat);
}
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
// If this is SSE1 only convert to FAND to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
return DAG.getBitcast(
MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
}
// Use a 32-bit and+zext if upper bits known zero.
if (VT == MVT::i64 && Subtarget.is64Bit() &&
!isa<ConstantSDNode>(N->getOperand(1))) {
APInt HiMask = APInt::getHighBitsSet(64, 32);
if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
SDLoc dl(N);
SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
}
}
// This must be done before legalization has expanded the ctpop.
if (SDValue V = combineParity(N, DAG, Subtarget))
return V;
// Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
APInt AllBits = APInt::getAllOnesValue(NumElts);
return DAG.getSetCC(dl, MVT::i1, Mask,
DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
}
}
}
if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
return V;
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;
if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
return R;
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
// Attempt to combine a scalar bitmask AND with an extracted shuffle.
if ((VT.getScalarSizeInBits() % 8) == 0 &&
N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
SDValue BitMask = N->getOperand(1);
SDValue SrcVec = N->getOperand(0).getOperand(0);
EVT SrcVecVT = SrcVec.getValueType();
// Check that the constant bitmask masks whole bytes.
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
if (VT == SrcVecVT.getScalarType() &&
N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
llvm::all_of(EltBits, [](APInt M) {
return M.isNullValue() || M.isAllOnesValue();
})) {
unsigned NumElts = SrcVecVT.getVectorNumElements();
unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
// Create a root shuffle mask from the byte mask and the extracted index.
SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
for (unsigned i = 0; i != Scale; ++i) {
if (UndefElts[i])
continue;
int VecIdx = Scale * Idx + i;
ShuffleMask[VecIdx] =
EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
}
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
}
}
return SDValue();
}
// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
MVT VT = N->getSimpleValueType(0);
if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
SDValue N0 = peekThroughBitcasts(N->getOperand(0));
SDValue N1 = peekThroughBitcasts(N->getOperand(1));
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
return SDValue();
// On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
// VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
Subtarget.hasVLX();
if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
!N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
return SDValue();
// Attempt to extract constant byte masks.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
false, false))
return SDValue();
if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
false, false))
return SDValue();
for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
// TODO - add UNDEF elts support.
if (UndefElts0[i] || UndefElts1[i])
return SDValue();
if (EltBits0[i] != ~EltBits1[i])
return SDValue();
}
SDLoc DL(N);
SDValue X = N->getOperand(0);
SDValue Y =
DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
DAG.getBitcast(VT, N1.getOperand(0)));
return DAG.getNode(ISD::OR, DL, VT, X, Y);
}
// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
if (N->getOpcode() != ISD::OR)
return false;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Canonicalize AND to LHS.
if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
// Attempt to match OR(AND(M,Y),ANDNP(M,X)).
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return false;
Mask = N1.getOperand(0);
X = N1.getOperand(1);
// Check to see if the mask appeared in both the AND and ANDNP.
if (N0.getOperand(0) == Mask)
Y = N0.getOperand(1);
else if (N0.getOperand(1) == Mask)
Y = N0.getOperand(0);
else
return false;
// TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for
// the ANDNP combine allows other combines to happen that prevent matching.
return true;
}
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
// (vselect m, x, y)
// As a special case, try to fold:
// (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
// (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
EVT VT = N->getValueType(0);
if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
SDValue X, Y, Mask;
if (!matchLogicBlend(N, X, Y, Mask))
return SDValue();
// Validate that X, Y, and Mask are bitcasts, and see through them.
Mask = peekThroughBitcasts(Mask);
X = peekThroughBitcasts(X);
Y = peekThroughBitcasts(Y);
EVT MaskVT = Mask.getValueType();
unsigned EltBits = MaskVT.getScalarSizeInBits();
// TODO: Attempt to handle floating point cases as well?
if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();
SDLoc DL(N);
// Attempt to combine to conditional negate: (sub (xor X, M), M)
if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
DAG, Subtarget))
return Res;
// PBLENDVB is only available on SSE4.1.
if (!Subtarget.hasSSE41())
return SDValue();
MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
Mask = DAG.getBitcast(BlendVT, Mask);
Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
}
// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
// seteq(cmp x, 0)
// into:
// srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
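// For i32, for example, ctlz(x) lies in [0, 32] and equals 32 only when
// x == 0, so shifting right by log2(32) == 5 yields 1 exactly when x == 0.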
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
SelectionDAG &DAG) {
SDValue Cmp = Op.getOperand(1);
EVT VT = Cmp.getOperand(0).getValueType();
unsigned Log2b = Log2_32(VT.getSizeInBits());
SDLoc dl(Op);
SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
// The result of the shift is true or false, and on X86, the 32-bit
// encoding of shr and lzcnt is more desirable.
SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
DAG.getConstant(Log2b, dl, MVT::i8));
return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}
// Try to transform:
// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
// Will also attempt to match more generic cases, eg:
// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
return SDValue();
auto isORCandidate = [](SDValue N) {
return (N->getOpcode() == ISD::OR && N->hasOneUse());
};
// Check that the zero extend is extending to 32 bits or more. The code
// generated by srl(ctlz) for 16-bit or smaller variants of the pattern
// would require extra instructions to clear the upper bits.
if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
!isORCandidate(N->getOperand(0)))
return SDValue();
// Check the node matches: setcc(eq, cmp 0)
auto isSetCCCandidate = [](SDValue N) {
return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
N->getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(N->getOperand(1).getOperand(1)) &&
N->getOperand(1).getValueType().bitsGE(MVT::i32);
};
SDNode *OR = N->getOperand(0).getNode();
SDValue LHS = OR->getOperand(0);
SDValue RHS = OR->getOperand(1);
// Save nodes matching or(or, setcc(eq, cmp 0)).
SmallVector<SDNode *, 2> ORNodes;
while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
ORNodes.push_back(OR);
OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
}
// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
!isORCandidate(SDValue(OR, 0)))
return SDValue();
// We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
// to or(srl(ctlz), srl(ctlz)).
// The dag combiner can then fold it into:
// srl(or(ctlz, ctlz)).
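// This fold is valid because srl distributes over or:
// or(srl(a, c), srl(b, c)) == srl(or(a, b), c).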
EVT VT = OR->getValueType(0);
SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
SDValue Ret, NewRHS;
if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
if (!Ret)
return SDValue();
// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
while (ORNodes.size() > 0) {
OR = ORNodes.pop_back_val();
LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
// Swap rhs with lhs to match or(setcc(eq, cmp 0), or).
if (RHS->getOpcode() == ISD::OR)
std::swap(LHS, RHS);
NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
if (!NewRHS)
return SDValue();
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
}
if (Ret)
Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
return Ret;
}
static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
!TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
bool OptForSize = DAG.shouldOptForSize();
unsigned Bits = VT.getScalarSizeInBits();
// SHLD/SHRD instructions have lower register pressure, but on some platforms
// they have higher latency than the equivalent series of shifts/ors that
// would otherwise be generated, so don't fold the pattern when SHLD/SHRD is
// slow and we are not optimizing for size.
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
return SDValue();
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
SDValue ShAmt0 = N0.getOperand(1);
if (ShAmt0.getValueType() != ShiftVT)
return SDValue();
SDValue ShAmt1 = N1.getOperand(1);
if (ShAmt1.getValueType() != ShiftVT)
return SDValue();
// Peek through any modulo shift masks.
SDValue ShMsk0;
if (ShAmt0.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk0 = ShAmt0;
ShAmt0 = ShAmt0.getOperand(0);
}
SDValue ShMsk1;
if (ShAmt1.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk1 = ShAmt1;
ShAmt1 = ShAmt1.getOperand(0);
}
if (ShAmt0.getOpcode() == ISD::TRUNCATE)
ShAmt0 = ShAmt0.getOperand(0);
if (ShAmt1.getOpcode() == ISD::TRUNCATE)
ShAmt1 = ShAmt1.getOperand(0);
SDLoc DL(N);
unsigned Opc = ISD::FSHL;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
Opc = ISD::FSHR;
std::swap(Op0, Op1);
std::swap(ShAmt0, ShAmt1);
std::swap(ShMsk0, ShMsk1);
}
auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
SDValue Amt) {
if (Opc == ISD::FSHR)
std::swap(Op0, Op1);
return DAG.getNode(Opc, DL, VT, Op0, Op1,
DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
};
// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
// OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
// OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
// OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
// OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
// OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
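// (Recall that, with shift amounts taken modulo Bits,
// ISD::FSHL(X, Y, C) == (X << C) | (Y >> (Bits - C)) and
// ISD::FSHR(X, Y, C) == (X << (Bits - C)) | (Y >> C).)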
if (ShAmt1.getOpcode() == ISD::SUB) {
SDValue Sum = ShAmt1.getOperand(0);
if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
if (ShAmt1Op1.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk1 = ShAmt1Op1;
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
}
if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
if ((SumC->getAPIntValue() == Bits ||
(SumC->getAPIntValue() == 0 && ShMsk1)) &&
ShAmt1Op1 == ShAmt0)
return GetFunnelShift(Op0, Op1, ShAmt0);
}
} else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
return GetFunnelShift(Op0, Op1, ShAmt0);
} else if (ShAmt1.getOpcode() == ISD::XOR) {
SDValue Mask = ShAmt1.getOperand(1);
if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
ShAmt1Op0 = ShAmt1Op0.getOperand(0);
if (MaskC->getSExtValue() == (Bits - 1) &&
(ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
Op1.getConstantOperandAPInt(1).isOneValue()) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
Op1.getOperand(0) == Op1.getOperand(1)) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
}
}
}
return SDValue();
}
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
// If this is SSE1-only, convert to FOR to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
return DAG.getBitcast(MVT::v4i32,
DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N0),
DAG.getBitcast(MVT::v4f32, N1)));
}
// Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
APInt AllBits = APInt::getNullValue(NumElts);
return DAG.getSetCC(dl, MVT::i1, Mask,
DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
}
}
}
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
return R;
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
return R;
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
return SDValue();
}
/// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
/// SETGT(X, -1)
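///
/// For i32 X, SRL(X, 31) is 1 iff X is negative, so the XOR with 1 yields 1
/// iff X is non-negative, which is exactly X > -1.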
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
// This is only worth doing if the output type is i8 or i1.
EVT ResultType = N->getValueType(0);
if (ResultType != MVT::i8 && ResultType != MVT::i1)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// We should be performing an xor against a truncated shift.
if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
return SDValue();
// Make sure we are performing an xor against one.
if (!isOneConstant(N1))
return SDValue();
// SetCC on x86 zero-extends, so only act on this if it's a logical shift.
SDValue Shift = N0.getOperand(0);
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
return SDValue();
// Make sure we are truncating from one of i16, i32 or i64.
EVT ShiftTy = Shift.getValueType();
if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
return SDValue();
// Make sure the shift amount extracts the sign bit.
if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1.
// N.B. Using SETGE against 0 works, but we want a canonical-looking
// comparison; using SETGT matches up with what TranslateX86CC does.
SDLoc DL(N);
SDValue ShiftOp = Shift.getOperand(0);
EVT ShiftOpTy = ShiftOp.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), ResultType);
SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
if (SetCCResultType != ResultType)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
return Cond;
}
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
/// pcmpgt X, -1
///
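/// The ashr smears the sign bit across each element, producing 0 for
/// non-negative elements and -1 for negative ones; the 'not' inverts that,
/// giving exactly the per-element mask computed by pcmpgt X, -1.
///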
/// This should be called before type legalization because the pattern may not
/// persist after that.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isSimple())
return SDValue();
switch (VT.getSimpleVT().SimpleTy) {
default: return SDValue();
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
}
// There must be an arithmetic shift right (sra) before the xor, and the xor
// must be a 'not' operation.
SDValue Shift = N->getOperand(0);
SDValue Ones = N->getOperand(1);
if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
!ISD::isBuildVectorAllOnes(Ones.getNode()))
return SDValue();
// The shift should be smearing the sign bit across each vector element.
auto *ShiftAmt =
isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
if (!ShiftAmt ||
ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1. We don't use the more obvious
// greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
}
/// Detect patterns of truncation with unsigned saturation:
///
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value x to be truncated or SDValue() if the pattern was
/// not matched.
///
/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
/// where C1 >= 0 and C2 is unsigned max of destination type.
///
/// (truncate (smax (smin (x, C2), C1)) to dest_type)
/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
///
/// These two patterns are equivalent to:
/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
/// So return the smax(x, C1) value to be truncated or SDValue() if the
/// pattern was not matched.
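///
/// For example, for a vXi8 destination, (truncate (umin X, 255)) matches
/// pattern 1 above and X is returned.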
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const SDLoc &DL) {
EVT InVT = In.getValueType();
// Saturation with truncation. We truncate from InVT to VT.
assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
"Unexpected types for truncate operation");
// Match min/max and return limit value as a parameter.
auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
if (V.getOpcode() == Opcode &&
ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
return V.getOperand(0);
return SDValue();
};
APInt C1, C2;
if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
// C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
// the element size of the destination type.
if (C2.isMask(VT.getScalarSizeInBits()))
return UMin;
if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
if (MatchMinMax(SMin, ISD::SMAX, C1))
if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
return SMin;
if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
C2.uge(C1)) {
return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
}
return SDValue();
}
/// Detect patterns of truncation with signed saturation:
/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
/// signed_max_of_dest_type)) to dest_type)
/// or:
/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
/// signed_min_of_dest_type)) to dest_type).
/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
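///
/// For example, for a vXi8 destination, (smin (smax X, -128), 127) matches and
/// X is returned; with MatchPackUS the matched clamp range is [0, 255] instead.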
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
unsigned NumDstBits = VT.getScalarSizeInBits();
unsigned NumSrcBits = In.getScalarValueSizeInBits();
assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
auto MatchMinMax = [](SDValue V, unsigned Opcode,
const APInt &Limit) -> SDValue {
APInt C;
if (V.getOpcode() == Opcode &&
ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
return V.getOperand(0);
return SDValue();
};
APInt SignedMax, SignedMin;
if (MatchPackUS) {
SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
SignedMin = APInt(NumSrcBits, 0);
} else {
SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
}
if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
return SMax;
if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
return SMin;
return SDValue();
}
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2() || !VT.isVector())
return SDValue();
EVT SVT = VT.getVectorElementType();
EVT InVT = In.getValueType();
EVT InSVT = InVT.getVectorElementType();
// If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
// split across two registers, we can use a packusdw+perm to clamp to 0-65535
// and concatenate at the same time, then use a final vpmovuswb to clip to
// 0-255.
if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
InVT == MVT::v16i32 && VT == MVT::v16i8) {
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
DL, DAG, Subtarget);
assert(Mid && "Failed to pack!");
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
}
}
// vXi32 truncate instructions are available with AVX512F.
// vXi16 truncate instructions are only available with AVX512BW.
// For 256-bit or smaller vectors, we require VLX.
// FIXME: We could widen truncates to 512 to remove the VLX restriction.
// If the result type is 256 bits or larger and we have disabled 512-bit
// registers, we should go ahead and use the pack instructions if possible.
bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
(Subtarget.hasBWI() && InSVT == MVT::i16)) &&
(InVT.getSizeInBits() > 128) &&
(Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
!(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
VT.getSizeInBits() >= 64 &&
(SVT == MVT::i8 || SVT == MVT::i16) &&
(InSVT == MVT::i16 || InSVT == MVT::i32)) {
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
// Only do this when the result is at least 64 bits, or we'd be leaving
// dangling PACKSSDW nodes.
if (SVT == MVT::i8 && InSVT == MVT::i32) {
EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
VT.getVectorNumElements());
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
DAG, Subtarget);
assert(Mid && "Failed to pack!");
SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
Subtarget);
assert(V && "Failed to pack!");
return V;
} else if (SVT == MVT::i8 || Subtarget.hasSSE41())
return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
Subtarget);
}
if (auto SSatVal = detectSSatPattern(In, VT))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
Subtarget);
}
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
unsigned TruncOpc = 0;
SDValue SatVal;
if (auto SSatVal = detectSSatPattern(In, VT)) {
SatVal = SSatVal;
TruncOpc = X86ISD::VTRUNCS;
} else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
SatVal = USatVal;
TruncOpc = X86ISD::VTRUNCUS;
}
if (SatVal) {
unsigned ResElts = VT.getVectorNumElements();
// If the input type is less than 512 bits and we don't have VLX, we need
// to widen to 512 bits.
if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
unsigned NumConcats = 512 / InVT.getSizeInBits();
ResElts *= NumConcats;
SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
ConcatOps[0] = SatVal;
InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
NumConcats * InVT.getVectorNumElements());
SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
}
// Widen the result if it's narrower than 128 bits.
if (ResElts * SVT.getSizeInBits() < 128)
ResElts = 128 / SVT.getSizeInBits();
EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
}
return SDValue();
}
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with an efficient
/// X86ISD::AVG instruction.
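/// (PAVGB/PAVGW compute the rounded average (a + b + 1) >> 1, which is why the
/// '+ 1' must be present in the matched pattern.)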
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
NumElems >= 2 && isPowerOf2_32(NumElems)))
return SDValue();
// InScalarVT is the intermediate type in the AVG pattern, and it should be
// wider than the original input type (i8/i16).
EVT InScalarVT = InVT.getVectorElementType();
if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
// Detect the following pattern:
//
// %1 = zext <N x i8> %a to <N x i32>
// %2 = zext <N x i8> %b to <N x i32>
// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
// %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, <i32 1 x N>
// %6 = trunc <N x i32> %5 to <N x i8>
//
// In AVX512, the last instruction can also be a trunc store.
if (In.getOpcode() != ISD::SRL)
return SDValue();
// A lambda checking that the given SDValue is a constant vector and that each
// element is in the range [Min, Max].
auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || !BV->isConstant())
return false;
for (SDValue Op : V->ops()) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return false;
const APInt &Val = C->getAPIntValue();
if (Val.ult(Min) || Val.ugt(Max))
return false;
}
return true;
};
// Check if each element of the vector is right-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
return SDValue();
if (LHS.getOpcode() != ISD::ADD)
return SDValue();
// Detect a pattern of a + b + 1 where the order doesn't matter.
SDValue Operands[3];
Operands[0] = LHS.getOperand(0);
Operands[1] = LHS.getOperand(1);
auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
};
// Take care of the case when one of the operands is a constant vector whose
// elements are in the range [1, 256].
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
Operands[0].getOperand(0).getValueType() == VT) {
// The pattern is detected. Subtract one from the constant vector, then
// demote it and emit X86ISD::AVG instruction.
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
return SplitOpsAndApply(DAG, Subtarget, DL, VT,
{ Operands[0].getOperand(0), Operands[1] },
AVGBuilder);
}
// Matches 'add-like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
// Match the or case only if it's 'add-like', i.e. can be replaced by an add.
auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
if (ISD::ADD == V.getOpcode()) {
Op0 = V.getOperand(0);
Op1 = V.getOperand(1);
return true;
}
if (ISD::ZERO_EXTEND != V.getOpcode())
return false;
V = V.getOperand(0);
if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
!DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
return false;
Op0 = V.getOperand(0);
Op1 = V.getOperand(1);
return true;
};
SDValue Op0, Op1;
if (FindAddLike(Operands[0], Op0, Op1))
std::swap(Operands[0], Operands[1]);
else if (!FindAddLike(Operands[1], Op0, Op1))
return SDValue();
Operands[2] = Op0;
Operands[1] = Op1;
// Now we have three operands of two additions. Check that one of them is a
// constant vector with ones, and the other two can be promoted from i8/i16.
for (int i = 0; i < 3; ++i) {
if (!IsConstVectorInRange(Operands[i], 1, 1))
continue;
std::swap(Operands[i], Operands[2]);
// Check if Operands[0] and Operands[1] are results of type promotion.
for (int j = 0; j < 2; ++j)
if (Operands[j].getValueType() != VT) {
if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
Operands[j].getOperand(0).getValueType() != VT)
return SDValue();
Operands[j] = Operands[j].getOperand(0);
}
// The pattern is detected, emit X86ISD::AVG instruction(s).
return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
AVGBuilder);
}
return SDValue();
}
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// For chips with slow 32-byte unaligned loads, break the 32-byte operation
// into two 16-byte operations. Also split non-temporal aligned loads on
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
*Ld->getMemOperand(), &Fast) &&
!Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
unsigned HalfAlign = 16;
SDValue Ptr1 = Ld->getBasePtr();
SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems / 2);
SDValue Load1 =
DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
Alignment, Ld->getMemOperand()->getFlags());
SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
Ld->getPointerInfo().getWithOffset(HalfAlign),
MinAlign(Alignment, HalfAlign),
Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1), Load2.getValue(1));
SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
return DCI.CombineTo(N, NewVec, TF, true);
}
// Bool vector load - attempt to cast to an integer, as we have good
// (vXiY *ext(vXi1 bitcast(iX))) handling.
if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
unsigned NumElts = RegVT.getVectorNumElements();
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
if (TLI.isTypeLegal(IntVT)) {
SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Alignment,
Ld->getMemOperand()->getFlags());
SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
}
}
return SDValue();
}
/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
static int getOneTrueElt(SDValue V) {
// This needs to be a build vector of booleans.
// TODO: Checking for the i1 type matches the IR definition for the mask,
// but the mask check could be loosened to i8 or other types. That might
// also require checking more than 'allOnesValue'; e.g., the x86 HW
// instructions only require that the MSB is set for each mask element.
// The ISD::MSTORE comments/definition do not specify how the mask operand
// is formatted.
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
return -1;
int TrueIndex = -1;
unsigned NumElts = BV->getValueType(0).getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
const SDValue &Op = BV->getOperand(i);
if (Op.isUndef())
continue;
auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
if (!ConstNode)
return -1;
if (ConstNode->getAPIntValue().isAllOnesValue()) {
// If we already found a one, this is too many.
if (TrueIndex >= 0)
return -1;
TrueIndex = i;
}
}
return TrueIndex;
}
/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
SelectionDAG &DAG, SDValue &Addr,
SDValue &Index, unsigned &Alignment) {
int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
if (TrueMaskElt < 0)
return false;
// Get the address of the one scalar element that is specified by the mask
// using the appropriate offset from the base pointer.
EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
Addr = MaskedOp->getBasePtr();
if (TrueMaskElt != 0) {
unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
}
Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
return true;
}
/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert(ML->isUnindexed() && "Unexpected indexed masked load!");
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
unsigned Alignment;
if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
return SDValue();
// Load the one scalar element that is specified by the mask using the
// appropriate offset from the base pointer.
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
EVT EltVT = VT.getVectorElementType();
SDValue Load =
DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
Alignment, ML->getMemOperand()->getFlags());
// Insert the loaded element into the appropriate place in the vector.
SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
ML->getPassThru(), Load, VecIndex);
return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert(ML->isUnindexed() && "Unexpected indexed masked load!");
if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
return SDValue();
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
// If we are loading the first and last elements of a vector, it is safe and
// always faster to load the whole vector. Replace the masked load with a
// vector load and select.
unsigned NumElts = VT.getVectorNumElements();
BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
if (LoadFirstElt && LoadLastElt) {
SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
ML->getMemOperand());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
ML->getPassThru());
return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
}
// Convert a masked load with a constant mask into a masked load and a select.
// This allows the select operation to use a faster kind of select instruction
// (for example, vblendvps -> vblendps).
// Don't try this if the pass-through operand is already undefined. That would
// cause an infinite loop because that's what we're about to create.
if (ML->getPassThru().isUndef())
return SDValue();
if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
return SDValue();
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
SDValue NewML = DAG.getMaskedLoad(
VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
ML->getAddressingMode(), ML->getExtensionType());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
ML->getPassThru());
return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
// TODO: Expanding load with constant mask may be optimized as well.
if (Mld->isExpandingLoad())
return SDValue();
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
return ScalarLoad;
// TODO: Do some AVX512 subsets benefit from this transform?
if (!Subtarget.hasAVX512())
if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
return Blend;
}
return SDValue();
}
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
SelectionDAG &DAG) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
unsigned Alignment;
if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
return SDValue();
// Extract the one scalar element that is actually being stored.
SDLoc DL(MS);
EVT VT = MS->getValue().getValueType();
EVT EltVT = VT.getVectorElementType();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
MS->getValue(), VecIndex);
// Store that element at the appropriate offset from the base pointer.
return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
Alignment, MS->getMemOperand()->getFlags());
}
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
if (Mst->isCompressingStore())
return SDValue();
EVT VT = Mst->getValue().getValueType();
SDLoc dl(Mst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Mst->isTruncatingStore())
return SDValue();
if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
return ScalarStore;
// If the mask value has been legalized to a non-boolean vector, try to
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
}
SDValue Value = Mst->getValue();
if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
Mst->getMemoryVT())) {
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
Mst->getBasePtr(), Mst->getOffset(), Mask,
Mst->getMemoryVT(), Mst->getMemOperand(),
Mst->getAddressingMode(), true);
}
return SDValue();
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
unsigned Alignment = St->getAlignment();
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Convert a store of vXi1 into a store of iX and a bitcast.
if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
VT.getVectorElementType() == MVT::i1) {
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
StoredVal = DAG.getBitcast(NewVT, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
// If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
// This will avoid a copy to k-register.
if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
StoredVal.getOperand(0).getValueType() == MVT::i8) {
return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
St->getBasePtr(), St->getPointerInfo(),
St->getAlignment(), St->getMemOperand()->getFlags());
}
// Widen v2i1/v4i1 stores to v8i1.
if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
Subtarget.hasAVX512()) {
unsigned NumConcats = 8 / VT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
Ops[0] = StoredVal;
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
// Turn vXi1 stores of constants into a scalar store.
if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
// If it's a v64i1 store without 64-bit support, we need two stores.
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(0, 32));
Lo = combinevXi1ConstantToInteger(Lo, DAG);
SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(32, 32));
Hi = combinevXi1ConstantToInteger(Hi, DAG);
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
SDValue Ch0 =
DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
Alignment, St->getMemOperand()->getFlags());
SDValue Ch1 =
DAG.getStore(St->getChain(), dl, Hi, Ptr1,
St->getPointerInfo().getWithOffset(4),
MinAlign(Alignment, 4U),
St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
// If we are saving a 32-byte vector and 32-byte stores are slow, such as on
// Sandy Bridge, perform two 16-byte stores.
bool Fast;
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*St->getMemOperand(), &Fast) &&
!Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
return splitVectorStore(St, DAG);
}
// Split under-aligned vector non-temporal stores.
if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
// ZMM/YMM nt-stores - either it can be stored as a series of shorter
// vectors or the legalizer can scalarize it to use MOVNTI.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
return splitVectorStore(St, DAG);
}
// XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
// to use MOVNTI.
if (VT.is128BitVector() && Subtarget.hasSSE2()) {
MVT NTVT = Subtarget.hasSSE4A()
? MVT::v2f64
: (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
return scalarizeVectorStore(St, NTVT, DAG);
}
}
// Try to optimize v16i16->v16i8 truncating stores when BWI is not supported
// but AVX512F is, by extending to v16i32 and truncating.
if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
St->getValue().getOpcode() == ISD::TRUNCATE &&
St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
MVT::v16i8, St->getMemOperand());
}
// Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
(StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
return EmitTruncSStore(IsSigned, St->getChain(),
dl, StoredVal.getOperand(0), St->getBasePtr(),
VT, St->getMemOperand(), DAG);
}
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
if (St->isTruncatingStore() && VT.isVector()) {
// Check if we can detect an AVG pattern from the truncation. If yes,
// replace the trunc store by a normal store with the result of X86ISD::AVG
// instruction.
if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
Subtarget, dl))
return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
if (TLI.isTruncStoreLegal(VT, StVT)) {
if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
DAG, dl))
return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
}
return SDValue();
}
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
// places to insert EMMS. This qualifies as a quick hack.
// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
if (VT.getSizeInBits() != 64)
return SDValue();
const Function &F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
isa<LoadSDNode>(St->getValue()) &&
cast<LoadSDNode>(St->getValue())->isSimple() &&
St->getChain().hasOneUse() && St->isSimple()) {
LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
if (!ISD::isNormalLoad(Ld))
return SDValue();
// Avoid the transformation if there are multiple uses of the loaded value.
if (!Ld->hasNUsesOfValue(1, 0))
return SDValue();
SDLoc LdDL(Ld);
SDLoc StDL(N);
// Lower to a single movq load/store pair.
SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
Ld->getBasePtr(), Ld->getMemOperand());
// Make sure new load is placed in same chain order.
DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
St->getMemOperand());
}
// This is similar to the above case, but here we handle a scalar 64-bit
// integer store that is extracted from a vector on a 32-bit target.
// If we have SSE2, then we can treat it like a floating-point double
// to get past legalization. The execution dependencies fixup pass will
// choose the optimal machine instruction for the store if this really is
// an integer or v2f32 rather than an f64.
if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue OldExtract = St->getOperand(1);
SDValue ExtOp0 = OldExtract.getOperand(0);
unsigned VecSize = ExtOp0.getValueSizeInBits();
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
BitCast, OldExtract.getOperand(1));
return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
return SDValue();
}
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
/// A = < float a0, float a1, float a2, float a3 >
/// and
/// B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool IsCommutative) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
return false;
// Look for the following pattern:
// A = < float a0, float a1, float a2, float a3 >
// B = < float b0, float b1, float b2, float b3 >
// and
// LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
// RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
// which is A horizontal-op B.
MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
unsigned NumElts = VT.getVectorNumElements();
// TODO - can we make a general helper method that does all of this for us?
auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
SmallVectorImpl<int> &ShuffleMask) {
if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
if (!Op.getOperand(0).isUndef())
N0 = Op.getOperand(0);
if (!Op.getOperand(1).isUndef())
N1 = Op.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
ShuffleMask.append(Mask.begin(), Mask.end());
return;
}
bool UseSubVector = false;
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Op.getOperand(0).getValueType().is256BitVector() &&
llvm::isNullConstant(Op.getOperand(1))) {
Op = Op.getOperand(0);
UseSubVector = true;
}
bool IsUnary;
SmallVector<SDValue, 2> SrcOps;
SmallVector<int, 16> SrcShuffleMask;
SDValue BC = peekThroughBitcasts(Op);
if (isTargetShuffle(BC.getOpcode()) &&
getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
SrcOps, SrcShuffleMask, IsUnary)) {
if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
SrcOps.size() <= 2) {
N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
}
if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
SrcOps.size() == 1) {
N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
ShuffleMask.append(Mask.begin(), Mask.end());
}
}
};
// View LHS in the form
// LHS = VECTOR_SHUFFLE A, B, LMask
// If LHS is not a shuffle, then pretend it is the identity shuffle:
// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: A default initialized SDValue represents an UNDEF of type VT.
SDValue A, B;
SmallVector<int, 16> LMask;
GetShuffle(LHS, A, B, LMask);
// Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
SmallVector<int, 16> RMask;
GetShuffle(RHS, C, D, RMask);
// At least one of the operands should be a vector shuffle.
unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
if (NumShuffles == 0)
return false;
if (LMask.empty()) {
A = LHS;
for (unsigned i = 0; i != NumElts; ++i)
LMask.push_back(i);
}
if (RMask.empty()) {
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
RMask.push_back(i);
}
// If A and B occur in reverse order in RHS, then canonicalize by commuting
// RHS operands and shuffle mask.
if (A != C) {
std::swap(C, D);
ShuffleVectorSDNode::commuteMask(RMask);
}
// Check that the shuffles are both shuffling the same vectors.
if (!(A == C && B == D))
return false;
// LHS and RHS are now:
// LHS = shuffle A, B, LMask
// RHS = shuffle A, B, RMask
// Check that the masks correspond to performing a horizontal operation.
// AVX defines horizontal add/sub to operate independently on 128-bit lanes,
// so we just repeat the inner loop if this is a 256-bit op.
unsigned Num128BitChunks = VT.getSizeInBits() / 128;
unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
assert((NumEltsPer128BitChunk % 2 == 0) &&
"Vector type should have an even number of elements in each lane");
for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
// Ignore undefined components.
int LIdx = LMask[i + j], RIdx = RMask[i + j];
if (LIdx < 0 || RIdx < 0 ||
(!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
(!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
continue;
// The low half of the 128-bit result must choose from A.
// The high half of the 128-bit result must choose from B,
// unless B is undef. In that case, we are always choosing from A.
unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
// Check that successive elements are being operated on. If not, this is
// not a horizontal operation.
int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
if (!(LIdx == Index && RIdx == Index + 1) &&
!(IsCommutative && LIdx == Index + 1 && RIdx == Index))
return false;
}
}
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
return false;
LHS = DAG.getBitcast(VT, LHS);
RHS = DAG.getBitcast(VT, RHS);
return true;
}
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
bool IsFadd = N->getOpcode() == ISD::FADD;
auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
return SDValue();
}
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
/// anything that is guaranteed to be transformed by DAGCombiner.
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
SDValue Src = N->getOperand(0);
unsigned SrcOpcode = Src.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
auto IsFreeTruncation = [VT](SDValue Op) {
unsigned TruncSizeInBits = VT.getScalarSizeInBits();
// See if this has been extended from a smaller/equal size to
// the truncation size, allowing a truncation to combine with the extend.
unsigned Opcode = Op.getOpcode();
if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
Opcode == ISD::ZERO_EXTEND) &&
Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
return true;
// See if this is a single-use constant which can be constant folded.
// NOTE: We don't peek through bitcasts here because there is currently no
// support for constant folding truncate+bitcast+vector_of_constants, so we'd
// just end up with a truncate on both operands, which would get turned back
// into (truncate (binop)), causing an infinite loop.
return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
};
auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
};
// Don't combine if the operation has other uses.
if (!Src.hasOneUse())
return SDValue();
// Only support vector truncation for now.
// TODO: i64 scalar math would benefit as well.
if (!VT.isVector())
return SDValue();
// In most cases it's only worth pre-truncating if we're only facing the cost
// of one truncation, i.e. if one of the inputs will constant fold or the
// input is repeated.
switch (SrcOpcode) {
case ISD::AND:
case ISD::XOR:
case ISD::OR: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
}
case ISD::MUL:
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ); it's
// better to truncate if we have the chance.
if (SrcVT.getScalarType() == MVT::i64 &&
TLI.isOperationLegal(SrcOpcode, VT) &&
!TLI.isOperationLegal(SrcOpcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
}
case ISD::SUB: {
// TODO: ISD::SUB - we are conservative and require both sides to be freely
// truncatable to avoid interfering with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
return TruncateArithmetic(Op0, Op1);
break;
}
}
return SDValue();
}
/// Truncate using ISD::AND mask and X86ISD::PACKUS.
/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// MaskX = X & 0xffff (clear high bits to prevent saturation)
/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
EVT OutVT = N->getValueType(0);
APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
OutVT.getScalarSizeInBits());
In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
}
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
EVT OutVT = N->getValueType(0);
In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
DAG.getValueType(OutVT));
return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
}
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with
/// each element extracted from a vector and then truncated, and it is
/// difficult to do this optimization on that form.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT OutVT = N->getValueType(0);
if (!OutVT.isVector())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = OutVT.getVectorNumElements();
// TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
// SSE2, and we need to take care of it specially.
// AVX512 provides vpmovdb.
if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
return SDValue();
EVT OutSVT = OutVT.getVectorElementType();
EVT InSVT = InVT.getVectorElementType();
if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
NumElems >= 8))
return SDValue();
// SSSE3's pshufb results in fewer instructions in the cases below.
if (Subtarget.hasSSSE3() && NumElems == 8 &&
((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
(InSVT == MVT::i32 && OutSVT == MVT::i16)))
return SDValue();
SDLoc DL(N);
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
if (InSVT == MVT::i32)
return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
return SDValue();
}
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
/// X86ISD::PACKSS/PACKUS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Requires SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
MVT VT = N->getValueType(0).getSimpleVT();
MVT SVT = VT.getScalarType();
MVT InVT = In.getValueType().getSimpleVT();
MVT InSVT = InVT.getScalarType();
// Check we have a truncation suited for PACKSS/PACKUS.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
return SDValue();
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
// AVX512 has fast truncate, but if the input is already going to be split,
// there's no harm in trying pack.
if (Subtarget.hasAVX512() &&
!(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
InVT.is512BitVector()))
return SDValue();
unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Use PACKUS if the input has zero-bits that extend all the way to the
// packed/truncated value. e.g. masks, zext_in_reg, etc.
KnownBits Known = DAG.computeKnownBits(In);
unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
// Use PACKSS if the input has sign-bits that extend all the way to the
// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
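// For example, a v8i32 all-ones/all-zeros comparison result has 32 sign
// bits per element, so truncating it to v8i16 can use PACKSSDW directly:
// every element already fits the signed i16 range that PACKSS preserves.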
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
return SDValue();
}
// Try to form a MULHU or MULHS node by looking for
// (trunc (srl (mul ext, ext), 16))
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
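// A worked example of the pattern: with x = 300 and y = 500 as i16,
// (mul (sext x), (sext y)) = 150000 = 0x000249F0 as i32, and the shift by
// 16 leaves 0x0002, which is exactly the high half that MULHS(x, y) yields.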
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// First instruction should be a right shift of a multiply.
if (Src.getOpcode() != ISD::SRL ||
Src.getOperand(0).getOpcode() != ISD::MUL)
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
// Only handle vXi16 types that are at least 128 bits unless they will be
// widened.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
return SDValue();
// Input type should be vXi32.
EVT InVT = Src.getValueType();
if (InVT.getVectorElementType() != MVT::i32)
return SDValue();
// Need a shift by 16.
APInt ShiftAmt;
if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
ShiftAmt != 16)
return SDValue();
SDValue LHS = Src.getOperand(0).getOperand(0);
SDValue RHS = Src.getOperand(0).getOperand(1);
unsigned ExtOpc = LHS.getOpcode();
if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
RHS.getOpcode() != ExtOpc)
return SDValue();
// Peek through the extends.
LHS = LHS.getOperand(0);
RHS = RHS.getOperand(0);
// Ensure the input types match.
if (LHS.getValueType() != VT || RHS.getValueType() != VT)
return SDValue();
unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
// from one vector with signed bytes from another vector, adds together
// adjacent pairs of 16-bit products, and signed-saturates the sum to
// 16 bits.
//
// Which looks something like this:
// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
// (mul (zext (odd elts (i8 A))), (sext (odd elts (i8 B)))))))
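// A small numeric illustration: with A[0] = 2, A[1] = 3 (unsigned) and
// B[0] = -4, B[1] = 5 (signed), the first output element is
// ssat(2 * -4 + 3 * 5) = ssat(7) = 7; the saturation only engages when the
// 17-bit sum leaves the signed 16-bit range.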
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector() || !Subtarget.hasSSSE3())
return SDValue();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
return SDValue();
SDValue SSatVal = detectSSatPattern(In, VT);
if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
return SDValue();
// Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
// of multiplies from even/odd elements.
SDValue N0 = SSatVal.getOperand(0);
SDValue N1 = SSatVal.getOperand(1);
if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
// TODO: Handle constant vectors and use knownbits/computenumsignbits?
// Canonicalize zero_extend to LHS.
if (N01.getOpcode() == ISD::ZERO_EXTEND)
std::swap(N00, N01);
if (N11.getOpcode() == ISD::ZERO_EXTEND)
std::swap(N10, N11);
// Ensure we have a zero_extend and a sign_extend.
if (N00.getOpcode() != ISD::ZERO_EXTEND ||
N01.getOpcode() != ISD::SIGN_EXTEND ||
N10.getOpcode() != ISD::ZERO_EXTEND ||
N11.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();
// Peek through the extends.
N00 = N00.getOperand(0);
N01 = N01.getOperand(0);
N10 = N10.getOperand(0);
N11 = N11.getOperand(0);
// Ensure the extend is from vXi8.
if (N00.getValueType().getVectorElementType() != MVT::i8 ||
N01.getValueType().getVectorElementType() != MVT::i8 ||
N10.getValueType().getVectorElementType() != MVT::i8 ||
N11.getValueType().getVectorElementType() != MVT::i8)
return SDValue();
// All inputs should be build_vectors.
if (N00.getOpcode() != ISD::BUILD_VECTOR ||
N01.getOpcode() != ISD::BUILD_VECTOR ||
N10.getOpcode() != ISD::BUILD_VECTOR ||
N11.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// N00/N10 are zero extended. N01/N11 are sign extended.
// For each output element, we need an even element from one vector
// multiplied by the even element of the other vector, plus the product of
// the odd elements of the same two vectors. That is, for each element i,
// this operation must be performed:
// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
SDValue ZExtIn, SExtIn;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue N00Elt = N00.getOperand(i);
SDValue N01Elt = N01.getOperand(i);
SDValue N10Elt = N10.getOperand(i);
SDValue N11Elt = N11.getOperand(i);
// TODO: Be more tolerant to undefs.
if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
return SDValue();
unsigned IdxN00 = ConstN00Elt->getZExtValue();
unsigned IdxN01 = ConstN01Elt->getZExtValue();
unsigned IdxN10 = ConstN10Elt->getZExtValue();
unsigned IdxN11 = ConstN11Elt->getZExtValue();
// Add is commutative so indices can be reordered.
if (IdxN00 > IdxN10) {
std::swap(IdxN00, IdxN10);
std::swap(IdxN01, IdxN11);
}
// N0 indices must be the even element. N1 indices must be the next odd element.
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
return SDValue();
SDValue N00In = N00Elt.getOperand(0);
SDValue N01In = N01Elt.getOperand(0);
SDValue N10In = N10Elt.getOperand(0);
SDValue N11In = N11Elt.getOperand(0);
// The first time we find an input, capture it.
if (!ZExtIn) {
ZExtIn = N00In;
SExtIn = N01In;
}
if (ZExtIn != N00In || SExtIn != N01In ||
ZExtIn != N10In || SExtIn != N11In)
return SDValue();
}
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT InVT = Ops[0].getValueType();
assert(InVT.getScalarType() == MVT::i8 &&
"Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
InVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
PMADDBuilder);
}
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
// Attempt to pre-truncate inputs to arithmetic ops instead.
if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
return V;
// Try to detect AVG pattern first.
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
// Try to detect PMADD
if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
return PMAdd;
// Try to combine truncation with signed/unsigned saturation.
if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
return Val;
// Try to combine PMULHUW/PMULHW for vXi16.
if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
return V;
// The bitcast source is a direct mmx result.
// Detect a truncation to i32 of a bitcast from x86mmx.
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
SDValue BCSrc = Src.getOperand(0);
if (BCSrc.getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
// Try to truncate extended sign/zero bits with PACKSS/PACKUS.
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
return V;
return combineVectorTruncation(N, DAG, Subtarget);
}
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
SDLoc DL(N);
if (auto SSatVal = detectSSatPattern(In, VT))
return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
return SDValue();
}
/// Returns the negated value if the node \p N flips the sign of an FP value.
///
/// An FP-negation node may have different forms: FNEG(x), FXOR(x, 0x80000000)
/// or FSUB(0, x).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we look through all bitcasts.
/// This also recognizes a splat of a negated value and returns the splat of
/// that value.
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
if (N->getOpcode() == ISD::FNEG)
return N->getOperand(0);
// Don't recurse exponentially.
if (Depth > SelectionDAG::MaxRecursionDepth)
return SDValue();
unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
EVT VT = Op->getValueType(0);
// Make sure the element size doesn't change.
if (VT.getScalarSizeInBits() != ScalarSize)
return SDValue();
unsigned Opc = Op.getOpcode();
switch (Opc) {
case ISD::VECTOR_SHUFFLE: {
// For a VECTOR_SHUFFLE(VEC1, VEC2), if VEC2 is undef, then the negate of
// this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
if (!Op.getOperand(1).isUndef())
return SDValue();
if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
cast<ShuffleVectorSDNode>(Op)->getMask());
break;
}
case ISD::INSERT_VECTOR_ELT: {
// Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
// -V, INDEX).
SDValue InsVector = Op.getOperand(0);
SDValue InsVal = Op.getOperand(1);
if (!InsVector.isUndef())
return SDValue();
if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
NegInsVal, Op.getOperand(2));
break;
}
case ISD::FSUB:
case ISD::XOR:
case X86ISD::FXOR: {
SDValue Op1 = Op.getOperand(1);
SDValue Op0 = Op.getOperand(0);
// For XOR and FXOR, we want to check if the constant bits of Op1 are sign
// bit masks. For FSUB, we have to check if the constant bits of Op0 are
// sign bit masks and hence we swap the operands.
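// E.g. for f32 a sign bit mask element is 0x80000000; (xor x, 0x80000000)
// flips only the sign bit and therefore computes -x.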
if (Opc == ISD::FSUB)
std::swap(Op0, Op1);
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
// Extract constant bits and see if they are all
// sign bit masks. Ignore the undef elements.
if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
/* AllowWholeUndefs */ true,
/* AllowPartialUndefs */ false)) {
for (unsigned I = 0, E = EltBits.size(); I < E; I++)
if (!UndefElts[I] && !EltBits[I].isSignMask())
return SDValue();
return peekThroughBitcasts(Op0);
}
}
}
return SDValue();
}
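// The opcode rewrites below follow the usual FMA identities (reading
// FMADD as a*b + c, FMSUB as a*b - c, FNMADD as -(a*b) + c and FNMSUB as
// -(a*b) - c), e.g.:
// NegMul: (-a)*b + c == -(a*b) + c, so FMA <-> FNMADD
// NegAcc: a*b + (-c) == a*b - c, so FMA <-> FMSUB
// NegRes: -(a*b + c) == -(a*b) - c, so FMA <-> FNMSUB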
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
bool NegRes) {
if (NegMul) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMADD; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FNMADD: Opcode = ISD::FMA; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
}
}
if (NegAcc) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
case X86ISD::FMSUB: Opcode = ISD::FMA; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
}
}
if (NegRes) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
}
}
return Opcode;
}
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT OrigVT = N->getValueType(0);
SDValue Arg = isFNEG(DAG, N);
if (!Arg)
return SDValue();
EVT VT = Arg.getValueType();
EVT SVT = VT.getScalarType();
SDLoc DL(N);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
// If we're negating an FMUL node on a target with FMA, then we can avoid the
// use of a constant by performing (-0 - A*B) instead.
// FIXME: Check rounding control flags as well once it becomes available.
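// The identity used here: FNMSUB(A, B, +0.0) computes -(A*B) - 0.0, which
// matches -(A*B) as long as signed zeros don't have to be honored, hence
// the no-signed-zeros check below.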
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Zero);
return DAG.getBitcast(OrigVT, NewNode);
}
// If we're negating an FMA node, then we can adjust the
// instruction to include the extra negation.
if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
switch (Arg.getOpcode()) {
case ISD::FMA:
case X86ISD::FMSUB:
case X86ISD::FNMADD:
case X86ISD::FNMSUB:
case X86ISD::FMADD_RND:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB_RND: {
// We can't handle scalar intrinsic nodes here because they would only
// invert one element and not the whole vector. But we could try to handle
// a negation of the lower element only.
unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true);
return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops()));
}
}
}
return SDValue();
}
char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
bool LegalOperations,
bool ForCodeSize,
unsigned Depth) const {
// fneg patterns are removable even if they have multiple uses.
if (isFNEG(DAG, Op.getNode(), Depth))
return 2;
// Don't recurse exponentially.
if (Depth > SelectionDAG::MaxRecursionDepth)
return 0;
EVT VT = Op.getValueType();
EVT SVT = VT.getScalarType();
switch (Op.getOpcode()) {
case ISD::FMA:
case X86ISD::FMSUB:
case X86ISD::FNMADD:
case X86ISD::FNMSUB:
case X86ISD::FMADD_RND:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB_RND: {
if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
!(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
break;
// This is always negatible for free but we might be able to remove some
// extra operand negations as well.
for (int i = 0; i != 3; ++i) {
char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
ForCodeSize, Depth + 1);
if (V == 2)
return V;
}
return 1;
}
}
return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
ForCodeSize, Depth);
}
SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations,
bool ForCodeSize,
unsigned Depth) const {
// fneg patterns are removable even if they have multiple uses.
if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth))
return DAG.getBitcast(Op.getValueType(), Arg);
EVT VT = Op.getValueType();
EVT SVT = VT.getScalarType();
unsigned Opc = Op.getOpcode();
switch (Opc) {
case ISD::FMA:
case X86ISD::FMSUB:
case X86ISD::FNMADD:
case X86ISD::FNMSUB:
case X86ISD::FMADD_RND:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB_RND: {
if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
!(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
break;
// This is always negatible for free but we might be able to remove some
// extra operand negations as well.
SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
for (int i = 0; i != 3; ++i) {
char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
ForCodeSize, Depth + 1);
if (V == 2)
NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations,
ForCodeSize, Depth + 1);
}
bool NegA = !!NewOps[0];
bool NegB = !!NewOps[1];
bool NegC = !!NewOps[2];
unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
// Fill in the non-negated ops with the original values.
for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
if (!NewOps[i])
NewOps[i] = Op.getOperand(i);
return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
}
}
return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
ForCodeSize, Depth);
}
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
// If we have integer vector types available, use the integer opcodes.
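// E.g. (X86ISD::FAND v4f32 a, b) becomes
// (bitcast (and (v4i32 bitcast a), (v4i32 bitcast b)));
// the logic op only touches bit patterns, so the bitcast round-trip is
// lossless.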
if (!VT.isVector() || !Subtarget.hasSSE2())
return SDValue();
SDLoc dl(N);
unsigned IntBits = VT.getScalarSizeInBits();
MVT IntSVT = MVT::getIntegerVT(IntBits);
MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
unsigned IntOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected FP logic op");
case X86ISD::FOR: IntOpcode = ISD::OR; break;
case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
case X86ISD::FAND: IntOpcode = ISD::AND; break;
case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
}
SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
return DAG.getBitcast(VT, IntOp);
}
/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() != ISD::XOR)
return SDValue();
SDValue LHS = N->getOperand(0);
if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
return SDValue();
X86::CondCode NewCC = X86::GetOppositeBranchCondition(
X86::CondCode(LHS->getConstantOperandVal(0)));
SDLoc DL(N);
return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
}
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// If this is SSE1-only, convert to FXOR to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
N->getValueType(0) == MVT::v4i32) {
return DAG.getBitcast(
MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
}
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue SetCC = foldXor1SetCC(N, DAG))
return SetCC;
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
return RV;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
return combineFneg(N, DAG, Subtarget);
}
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
unsigned NumBits = VT.getSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// TODO - Constant Folding.
if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
// Reduce Cst1 to the bottom 16 bits.
// NOTE: SimplifyDemandedBits won't do this for constants.
const APInt &Val1 = Cst1->getAPIntValue();
APInt MaskedVal1 = Val1 & 0xFFFF;
if (MaskedVal1 != Val1)
return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
DAG.getConstant(MaskedVal1, SDLoc(N), VT));
}
// Only the bottom 16 bits of the control bits are required.
APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
}
static bool isNullFPScalarOrVectorConst(SDValue V) {
return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}
/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (e.g., bitwise-and)
/// where an undef should not propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!isNullFPScalarOrVectorConst(V))
return SDValue();
if (V.getValueType().isVector())
return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
return V;
}
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
// Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::f64 && Subtarget.hasSSE2()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
return SDValue();
auto isAllOnesConstantFP = [](SDValue V) {
if (V.getSimpleValueType().isVector())
return ISD::isBuildVectorAllOnes(V.getNode());
auto *C = dyn_cast<ConstantFPSDNode>(V);
return C && C->getConstantFPValue()->isAllOnesValue();
};
// fand (fxor X, -1), Y --> fandn X, Y
if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
// fand X, (fxor Y, -1) --> fandn Y, X
if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
return SDValue();
}
/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FAND(0.0, x) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
return V;
// FAND(x, 0.0) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
return V;
if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
return V;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FANDN(0.0, x) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(0)))
return N->getOperand(1);
// FANDN(x, 0.0) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
return V;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
// F[X]OR(0.0, x) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(0)))
return N->getOperand(1);
// F[X]OR(x, 0.0) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(1)))
return N->getOperand(0);
if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
return NewVal;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
// FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
if (!DAG.getTarget().Options.NoNaNsFPMath ||
!DAG.getTarget().Options.NoSignedZerosFPMath)
return SDValue();
// If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
// into FMINC and FMAXC, which are commutative operations.
unsigned NewOp = 0;
switch (N->getOpcode()) {
default: llvm_unreachable("unknown opcode");
case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
}
return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
}
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Subtarget.useSoftFloat())
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64) ||
(VT.isVector() && TLI.isTypeLegal(VT))))
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
// If we don't have to respect NaN inputs, this is a direct translation to x86
// min/max instructions.
if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
// If one of the operands is known non-NaN use the native min/max instructions
// with the non-NaN input as second operand.
if (DAG.isKnownNeverNaN(Op1))
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
if (DAG.isKnownNeverNaN(Op0))
return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
// If we have to respect NaN inputs, this takes at least 3 instructions.
// Favor a library call when operating on a scalar and minimizing code size.
if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
VT);
// There are 4 possibilities involving NaN inputs, and these are the required
// outputs:
// Op1
// Num NaN
// ----------------
// Num | Max | Op0 |
// Op0 ----------------
// NaN | Op1 | NaN |
// ----------------
//
// The SSE FP max/min instructions were not designed for this case, but rather
// to implement:
// Min = Op1 < Op0 ? Op1 : Op0
// Max = Op1 > Op0 ? Op1 : Op0
//
// So they always return Op0 if either input is a NaN. However, we can still
// use those instructions for fmaxnum by selecting away a NaN input.
// If either operand is NaN, the 2nd source operand (Op0) is passed through.
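// For example, with Op0 = NaN and Op1 = 5.0 the min/max below returns Op0
// (NaN), but IsOp0Nan is true, so the final select yields Op1 = 5.0, as
// fmaxnum/fminnum require for a NaN paired with a number.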
SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
// Convert a full vector load into vzload when not all bits are needed.
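// E.g. if only the low two elements of a v4i32 load feed a conversion to
// v2f64, only 64 bits of the load are used, so it can be narrowed to an
// i64 VZEXT_LOAD (assuming the rest of the load is dead).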
SDValue In = N->getOperand(0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
// Unless the load is volatile or atomic.
if (LN->isSimple()) {
SDLoc dl(N);
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getIntegerVT(NumBits);
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue VZLoad =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
LN->getPointerInfo(),
LN->getAlignment(),
LN->getMemOperand()->getFlags());
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return SDValue(N, 0);
}
}
return SDValue();
}
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// FIXME: Handle strict fp nodes.
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
SDValue In = N->getOperand(0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(In);
// Unless the load is volatile or atomic.
if (LN->isSimple()) {
SDLoc dl(N);
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getFloatingPointVT(NumBits);
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue VZLoad =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
LN->getPointerInfo(),
LN->getAlignment(),
LN->getMemOperand()->getFlags());
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return SDValue(N, 0);
}
}
return SDValue();
}
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
// ANDNP(0, x) -> x
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return N->getOperand(1);
// ANDNP(x, 0) -> 0
if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Turn ANDNP back to AND if input is inverted.
if (SDValue Not = IsNOT(N->getOperand(0), DAG))
return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
N->getOperand(1));
// Attempt to recursively combine a bitmask ANDNP with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
return SDValue();
}
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// BT ignores high bits in the bit index operand.
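// E.g. with an i32 bit index only the low 5 bits matter, so an index of 37
// tests the same bit as an index of 5 (37 & 31 == 5).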
unsigned BitWidth = N1.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
return SDValue();
}
// Try to combine sext_in_reg of a cmov of constants by extending the constants.
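// E.g. (sext_in_reg (cmov C1, C2), i8) becomes (cmov C1', C2'), where the
// sign_extend_inreg of each constant operand folds at build time, say
// C1 = 0x80 -> C1' = -128.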
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
EVT DstVT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
return SDValue();
// Look through single use any_extends / truncs.
SDValue IntermediateBitwidthOp;
if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
N0.hasOneUse()) {
IntermediateBitwidthOp = N0;
N0 = N0.getOperand(0);
}
// See if we have a single use cmov.
if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
return SDValue();
SDValue CMovOp0 = N0.getOperand(0);
SDValue CMovOp1 = N0.getOperand(1);
// Make sure both operands are constants.
if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
!isa<ConstantSDNode>(CMovOp1.getNode()))
return SDValue();
SDLoc DL(N);
// If we looked through an any_extend/trunc above, apply the same op to the
// constants.
if (IntermediateBitwidthOp) {
unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
}
CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
EVT CMovVT = DstVT;
// We do not want i16 CMOVs. Promote to i32 and truncate afterwards.
if (DstVT == MVT::i16) {
CMovVT = MVT::i32;
CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
}
SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
N0.getOperand(2), N0.getOperand(3));
if (CMovVT != DstVT)
CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
return CMov;
}
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
if (SDValue V = combineSextInRegCmov(N, DAG))
return V;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
SDLoc dl(N);
// SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
// since there is no sign-extended shift right operation on a vector with
// 64-bit elements.
// (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
// (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND)) {
SDValue N00 = N0.getOperand(0);
// EXTLOAD has a better solution on AVX2: it may be replaced with an
// X86ISD::VSEXT node.
if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
N00, N1);
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
}
}
return SDValue();
}
/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
Ext->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
// TODO: This should be valid for other integer types.
EVT VT = Ext->getValueType(0);
if (VT != MVT::i64)
return SDValue();
SDValue Add = Ext->getOperand(0);
if (Add.getOpcode() != ISD::ADD)
return SDValue();
bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
bool NSW = Add->getFlags().hasNoSignedWrap();
bool NUW = Add->getFlags().hasNoUnsignedWrap();
// We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding into
// the 'zext'.
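// E.g. (i64 sext (i32 add nsw x, 5)) -> (i64 add nsw (i64 sext x), 5); the
// widened constant is free, and the 5 can later become the displacement
// field of an LEA.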
if ((Sext && !NSW) || (!Sext && !NUW))
return SDValue();
// Having a constant operand to the 'add' ensures that we are not increasing
// the instruction count because the constant is extended for free below.
// A constant operand can also become the displacement field of an LEA.
auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
if (!AddOp1)
return SDValue();
// Don't make the 'add' bigger if there's no hope of combining it with some
// other 'add' or 'shl' instruction.
// TODO: It may be profitable to generate simpler LEA instructions in place
// of single 'add' instructions, but the cost model for selecting an LEA
// currently has a high threshold.
bool HasLEAPotential = false;
for (auto *User : Ext->uses()) {
if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
HasLEAPotential = true;
break;
}
}
if (!HasLEAPotential)
return SDValue();
// Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
SDValue AddOp0 = Add.getOperand(0);
SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
// The wider add is guaranteed to not wrap because both operands are
// sign-extended.
SDNodeFlags Flags;
Flags.setNoSignedWrap(NSW);
Flags.setNoUnsignedWrap(NUW);
return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}
// If we face an {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
// operands, and the result of the CMOV is not used anywhere else, promote the
// CMOV itself instead of promoting its result. This could be beneficial
// because:
// 1) X86TargetLowering::EmitLoweredSelect can later merge two (or more)
// pseudo-CMOVs only when they go one after another, and getting rid of
// the result-extension code after the CMOV helps with that.
// 2) Promotion of constant CMOV arguments is free, hence the
// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
// promotion is also good in terms of code size.
// (64-bit CMOV is 4 bytes; that's why we don't do 32-bit => 64-bit
// promotion.)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
SDValue CMovN = Extend->getOperand(0);
if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
return SDValue();
EVT TargetVT = Extend->getValueType(0);
unsigned ExtendOpcode = Extend->getOpcode();
SDLoc DL(Extend);
EVT VT = CMovN.getValueType();
SDValue CMovOp0 = CMovN.getOperand(0);
SDValue CMovOp1 = CMovN.getOperand(1);
if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
!isa<ConstantSDNode>(CMovOp1.getNode()))
return SDValue();
// Only extend to i32 or i64.
if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
return SDValue();
// Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
// are free.
if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
return SDValue();
// If this is a zero extend to i64, we should only extend to i32 and use a free
// zero extend to finish.
EVT ExtendVT = TargetVT;
if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
ExtendVT = MVT::i32;
CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
CMovN.getOperand(2), CMovN.getOperand(3));
// Finish extending if needed.
if (ExtendVT != TargetVT)
Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
return Res;
}
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
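// E.g. (v8i16 sext (v8i1 bitcast (i8 x))) broadcasts x, ANDs each lane with
// its bit mask (1, 2, 4, ...), compares for equality against the mask and
// sign-extends, so x = 0b00000101 yields <-1, 0, -1, 0, 0, 0, 0, 0>.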
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
Opcode != ISD::ANY_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
return SDValue();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT InSVT = N0.getValueType().getScalarType();
unsigned EltSizeInBits = SVT.getSizeInBits();
// The input must be a bool vector (bitcast from a scalar integer) that is
// being extended to legal integer vector types.
if (!VT.isVector())
return SDValue();
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
return SDValue();
if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
return SDValue();
SDValue N00 = N0.getOperand(0);
EVT SclVT = N0.getOperand(0).getValueType();
if (!SclVT.isScalarInteger())
return SDValue();
SDLoc DL(N);
SDValue Vec;
SmallVector<int, 32> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
// Broadcast the scalar integer to the vector elements.
if (NumElts > EltSizeInBits) {
// If the scalar integer is greater than the vector element size, then we
// must split it down into sub-sections for broadcasting. For example:
// i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
// i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
unsigned Scale = NumElts / EltSizeInBits;
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
Vec = DAG.getBitcast(VT, Vec);
for (unsigned i = 0; i != Scale; ++i)
ShuffleMask.append(EltSizeInBits, i);
} else {
// For smaller scalar integers, we can simply any-extend it to the vector
// element size (we don't care about the upper bits) and broadcast it to all
// elements.
SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
ShuffleMask.append(NumElts, 0);
}
Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
// Now, mask the relevant bit in each element.
SmallVector<SDValue, 32> Bits;
for (unsigned i = 0; i != NumElts; ++i) {
int BitIdx = (i % EltSizeInBits);
APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
Bits.push_back(DAG.getConstant(Bit, DL, SVT));
}
SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
// Compare against the bitmask and extend the result.
EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
// For SEXT, this is now done, otherwise shift the result down for
// zero-extension.
if (Opcode == ISD::SIGN_EXTEND)
return Vec;
return DAG.getNode(ISD::SRL, DL, VT, Vec,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
}
// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
SDLoc dl(N);
// Only do this combine with AVX512 for vector extends.
if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
return SDValue();
// Only combine legal element types.
EVT SVT = VT.getVectorElementType();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
return SDValue();
// We can only do this if the vector size is 256 bits or less.
unsigned Size = VT.getSizeInBits();
if (Size > 256)
return SDValue();
// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
// those are the only integer compares we have.
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
if (ISD::isUnsignedIntSetCC(CC))
return SDValue();
// Only do this combine if the extension will be fully consumed by the setcc.
EVT N00VT = N0.getOperand(0).getValueType();
EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
if (Size != MatchingVecType.getSizeInBits())
return SDValue();
SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
if (N->getOpcode() == ISD::ZERO_EXTEND)
Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
return Res;
}
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = N0.getValueType();
SDLoc DL(N);
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
// Inverting and sign-extending a boolean is the same as zero-extending and
// subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
// efficiently lowered with an LEA or a DEC. This is the same as:
// select Bool, 0, -1.
// sext (xor Bool, -1) --> sub (zext Bool), 1
SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
}
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (VT.isVector())
if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
return SDValue();
}
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
// Let legalize expand this if it isn't a legal type yet.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(VT))
return SDValue();
EVT ScalarVT = VT.getScalarType();
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
SDValue A = N->getOperand(0);
SDValue B = N->getOperand(1);
SDValue C = N->getOperand(2);
auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) {
V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize);
return true;
}
// Look through extract_vector_elts. If it comes from an FNEG, create a
// new extract from the FNEG input.
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isNullConstant(V.getOperand(1))) {
SDValue Vec = V.getOperand(0);
if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) {
SDValue NegVal =
TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize);
V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
NegVal, V.getOperand(1));
return true;
}
}
return false;
};
// Do not convert the passthru input of scalar intrinsics.
// FIXME: We could allow negations of the lower element only.
bool NegA = invertIfNegative(A);
bool NegB = invertIfNegative(B);
bool NegC = invertIfNegative(C);
if (!NegA && !NegB && !NegC)
return SDValue();
unsigned NewOpcode =
negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
SDValue N2 = N->getOperand(2);
if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2)
return SDValue();
SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize);
unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
NegN2, N->getOperand(3));
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
NegN2);
}
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
// (and (i32 x86isd::setcc_carry), 1)
// This eliminates the zext. This transformation is necessary because
// ISD::SETCC is always legalized to i8.
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
if (N0.getOpcode() == ISD::AND &&
N0.hasOneUse() &&
N0.getOperand(0).hasOneUse()) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
if (!isOneConstant(N0.getOperand(1)))
return SDValue();
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, dl, VT));
}
}
if (N0.getOpcode() == ISD::TRUNCATE &&
N0.hasOneUse() &&
N0.getOperand(0).hasOneUse()) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, dl, VT));
}
}
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
if (DCI.isBeforeLegalizeOps())
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (VT.isVector())
if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
return R;
// TODO: Combine with any target/faux shuffle.
if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
(N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
return concatSubVectors(N00, N01, DAG, dl);
}
}
return SDValue();
}
/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
/// recognizable memcmp expansion.
static bool isOrXorXorTree(SDValue X, bool Root = true) {
if (X.getOpcode() == ISD::OR)
return isOrXorXorTree(X.getOperand(0), false) &&
isOrXorXorTree(X.getOperand(1), false);
if (Root)
return false;
return X.getOpcode() == ISD::XOR;
}
/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
/// expansion.
template<typename F>
static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
SDValue Op0 = X.getOperand(0);
SDValue Op1 = X.getOperand(1);
if (X.getOpcode() == ISD::OR) {
SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
if (VecVT != CmpVT)
return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
if (HasPT)
return DAG.getNode(ISD::OR, DL, VecVT, A, B);
return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
} else if (X.getOpcode() == ISD::XOR) {
SDValue A = SToV(Op0);
SDValue B = SToV(Op1);
if (VecVT != CmpVT)
return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
if (HasPT)
return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
}
llvm_unreachable("Impossible");
}
/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
// We're looking for an oversized integer equality comparison.
SDValue X = SetCC->getOperand(0);
SDValue Y = SetCC->getOperand(1);
EVT OpVT = X.getValueType();
unsigned OpSize = OpVT.getSizeInBits();
if (!OpVT.isScalarInteger() || OpSize < 128)
return SDValue();
// Ignore a comparison with zero because that gets special treatment in
// EmitTest(). But make an exception for the special case of a pair of
// logically-combined vector-sized operands compared to zero. This pattern may
// be generated by the memcmp expansion pass with oversized integer compares
// (see PR33325).
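// E.g. an expanded 32-byte memcmp equality may arrive here as
// (setcc (or (xor LA0, LB0), (xor LA1, LB1)), 0, eq)
// where LA0/LB0 and LA1/LB1 are i128 loads of the two buffers' halves.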
bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
auto IsVectorBitCastCheap = [](SDValue X) {
X = peekThroughBitcasts(X);
return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
X.getOpcode() == ISD::LOAD;
};
if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
!IsOrXorXorTreeCCZero)
return SDValue();
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
bool HasAVX = Subtarget.hasAVX();
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && HasAVX) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
// PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
// vector registers are essentially free. (Technically, widening registers
// prevents load folding, but the tradeoff is worth it.)
bool PreferKOT = Subtarget.preferMaskRegisters();
bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
EVT VecVT = MVT::v16i8;
EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
if (OpSize == 256) {
VecVT = MVT::v32i8;
CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
}
EVT CastVT = VecVT;
bool NeedsAVX512FCast = false;
if (OpSize == 512 || NeedZExt) {
if (Subtarget.hasBWI()) {
VecVT = MVT::v64i8;
CmpVT = MVT::v64i1;
if (OpSize == 512)
CastVT = VecVT;
} else {
VecVT = MVT::v16i32;
CmpVT = MVT::v16i1;
CastVT = OpSize == 512 ? VecVT :
OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
NeedsAVX512FCast = true;
}
}
auto ScalarToVector = [&](SDValue X) -> SDValue {
bool TmpZext = false;
EVT TmpCastVT = CastVT;
if (X.getOpcode() == ISD::ZERO_EXTEND) {
SDValue OrigX = X.getOperand(0);
unsigned OrigSize = OrigX.getScalarValueSizeInBits();
if (OrigSize < OpSize) {
if (OrigSize == 128) {
TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
X = OrigX;
TmpZext = true;
} else if (OrigSize == 256) {
TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
X = OrigX;
TmpZext = true;
}
}
}
X = DAG.getBitcast(TmpCastVT, X);
if (!NeedZExt && !TmpZext)
return X;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
DAG.getConstant(0, DL, VecVT), X,
DAG.getConstant(0, DL, VecIdxVT));
};
SDValue Cmp;
if (IsOrXorXorTreeCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
// Use 2 vector equality compares and 'and' the results before doing a
// MOVMSK.
Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
} else {
SDValue VecX = ScalarToVector(X);
SDValue VecY = ScalarToVector(Y);
if (VecVT != CmpVT) {
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
} else if (HasPT) {
Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
} else {
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
}
}
// AVX512 should emit a setcc that will lower to kortest.
if (VecVT != CmpVT) {
EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
DAG.getConstant(0, DL, KRegVT), CC);
}
if (HasPT) {
SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
Cmp);
SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
SDValue SetCC = getSETCC(X86CC, PT, DL, DAG);
return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0));
}
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
// setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
// setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
MVT::i32);
return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
}
return SDValue();
}
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
const SDValue LHS = N->getOperand(0);
const SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT OpVT = LHS.getValueType();
SDLoc DL(N);
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
// 0-x == y --> x+y == 0
// 0-x != y --> x+y != 0
if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
LHS.hasOneUse()) {
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
// x == 0-y --> x+y == 0
// x != 0-y --> x+y != 0
if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
RHS.hasOneUse()) {
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
return V;
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
// Using temporaries to avoid messing up operand ordering for later
// transformations if this doesn't work.
SDValue Op0 = LHS;
SDValue Op1 = RHS;
ISD::CondCode TmpCC = CC;
// Put build_vector on the right.
if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
std::swap(Op0, Op1);
TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
}
bool IsSEXT0 =
(Op0.getOpcode() == ISD::SIGN_EXTEND) &&
(Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
if (IsSEXT0 && IsVZero1) {
assert(VT == Op0.getOperand(0).getValueType() &&
"Uexpected operand type");
if (TmpCC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
if (TmpCC == ISD::SETLE)
return DAG.getConstant(1, DL, VT);
if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
return DAG.getNOT(DL, Op0.getOperand(0), VT);
assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
"Unexpected condition code!");
return Op0.getOperand(0);
}
}
// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
// pre-promote its result type since vXi1 vectors don't get promoted
// during type legalization.
// NOTE: The element count check is to ignore operand types that need to
// go through type promotion to a 128-bit vector.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
VT.getVectorElementType() == MVT::i1 &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16)) {
SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
}
// For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
// to avoid scalarization via legalization because v4i32 is not a legal type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
LHS.getValueType() == MVT::v4f32)
return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
return SDValue();
}
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = N->getSimpleValueType(0);
unsigned NumBits = VT.getScalarSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
// Perform constant folding.
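// Illustrative example: movmsk of the constant v4i32 <-1, 0, -1, 0> gathers
// the sign bit of each lane, folding to the i32 constant 0b0101 (5).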
if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
assert(VT == MVT::i32 && "Unexpected result type");
APInt Imm(32, 0);
for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
if (!Src.getOperand(Idx).isUndef() &&
Src.getConstantOperandAPInt(Idx).isNegative())
Imm.setBit(Idx);
}
return DAG.getConstant(Imm, SDLoc(N), VT);
}
// Look through int->fp bitcasts that don't change the element width.
unsigned EltWidth = SrcVT.getScalarSizeInBits();
if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
// Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
// with scalar comparisons.
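// e.g. for v4i32 (illustrative): movmsk (not X) --> (movmsk X) xor 0xF,
// since only the low 4 result bits correspond to lanes.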
if (SDValue NotSrc = IsNOT(Src, DAG)) {
SDLoc DL(N);
APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
NotSrc = DAG.getBitcast(SrcVT, NotSrc);
return DAG.getNode(ISD::XOR, DL, VT,
DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
DAG.getConstant(NotMask, DL, VT));
}
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getAllOnesValue(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
}
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// With vector masks we only demand the upper bit of the mask.
SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
}
return SDValue();
}
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
auto *GorS = cast<MaskedGatherScatterSDNode>(N);
SDValue Chain = GorS->getChain();
SDValue Index = GorS->getIndex();
SDValue Mask = GorS->getMask();
SDValue Base = GorS->getBasePtr();
SDValue Scale = GorS->getScale();
if (DCI.isBeforeLegalize()) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();
// Shrink constant indices if they are larger than 32-bits.
// Only do this before legalize types since v2i64 could become v2i32.
// FIXME: We could check that the type is legal if we're after legalize
// types, but then we would need to construct test cases where that happens.
// FIXME: We could support more than just constant vectors, but we need to be
// careful with costing. A truncate that can be optimized out would be fine.
// Otherwise we might only want to create a truncate if it avoids a split.
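// Illustrative example: a constant v2i64 index <0, 1> has more than 32 sign
// bits per element, so it can be losslessly truncated to a v2i32 index.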
if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
if (BV->isConstant() && IndexWidth > 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
SDValue Ops[] = { Chain, Gather->getPassThru(),
Mask, Base, Index, Scale };
return DAG.getMaskedGather(Gather->getVTList(),
Gather->getMemoryVT(), DL, Ops,
Gather->getMemOperand(),
Gather->getIndexType());
}
auto *Scatter = cast<MaskedScatterSDNode>(GorS);
SDValue Ops[] = { Chain, Scatter->getValue(),
Mask, Base, Index, Scale };
return DAG.getMaskedScatter(Scatter->getVTList(),
Scatter->getMemoryVT(), DL,
Ops, Scatter->getMemOperand(),
Scatter->getIndexType());
}
}
// Shrink any sign/zero extends from a source type of 32 bits or smaller to
// a destination type wider than 32 bits, if there are sufficient sign bits.
// Only do this before legalize types to avoid creating illegal types in the
// truncate.
if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
Index.getOpcode() == ISD::ZERO_EXTEND) &&
IndexWidth > 32 &&
Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
SDValue Ops[] = { Chain, Gather->getPassThru(),
Mask, Base, Index, Scale };
return DAG.getMaskedGather(Gather->getVTList(),
Gather->getMemoryVT(), DL, Ops,
Gather->getMemOperand(),
Gather->getIndexType());
}
auto *Scatter = cast<MaskedScatterSDNode>(GorS);
SDValue Ops[] = { Chain, Scatter->getValue(),
Mask, Base, Index, Scale };
return DAG.getMaskedScatter(Scatter->getVTList(),
Scatter->getMemoryVT(), DL,
Ops, Scatter->getMemOperand(),
Scatter->getIndexType());
}
}
if (DCI.isBeforeLegalizeOps()) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();
// Make sure the index is either i32 or i64
if (IndexWidth != 32 && IndexWidth != 64) {
MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
SDValue Ops[] = { Chain, Gather->getPassThru(),
Mask, Base, Index, Scale };
return DAG.getMaskedGather(Gather->getVTList(),
Gather->getMemoryVT(), DL, Ops,
Gather->getMemOperand(),
Gather->getIndexType());
}
auto *Scatter = cast<MaskedScatterSDNode>(GorS);
SDValue Ops[] = { Chain, Scatter->getValue(),
Mask, Base, Index, Scale };
return DAG.getMaskedScatter(Scatter->getVTList(),
Scatter->getMemoryVT(), DL,
Ops, Scatter->getMemOperand(),
Scatter->getIndexType());
}
}
// With vector masks we only demand the upper bit of the mask.
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
}
return SDValue();
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
// Try to simplify the EFLAGS and condition code operands.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
return getSETCC(CC, Flags, DL, DAG);
return SDValue();
}
/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue EFLAGS = N->getOperand(3);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
// Try to simplify the EFLAGS and condition code operands.
// Make sure to not keep references to operands, as combineSetCCEFLAGS can
// RAUW them under us.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
N->getOperand(1), Cond, Flags);
}
return SDValue();
}
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
SelectionDAG &DAG) {
// Take advantage of vector comparisons producing 0 or -1 in each lane to
// optimize away operation when it's from a constant.
//
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
// AND(VECTOR_CMP(x,y), constant2)
// constant2 = UNARYOP(constant)
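// An illustrative instance (assuming v4i32/v4f32 types):
// sint_to_fp (and (vector_cmp x, y), <i32 1, i32 1, i32 1, i32 1>)
// --> bitcast (and (vector_cmp x, y), bitcast (sint_to_fp <i32 1, ...>))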
// Early exit if this isn't a vector operation, the operand of the
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
bool IsStrict = N->isStrictFPOpcode();
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
if (!VT.isVector() || Op0->getOpcode() != ISD::AND ||
Op0->getOperand(0)->getOpcode() != ISD::SETCC ||
VT.getSizeInBits() != Op0.getValueSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
SDLoc DL(N);
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
SDValue SourceConst;
if (IsStrict)
SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
{N->getOperand(0), SDValue(BV, 0)});
else
SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
MaskConst);
SDValue Res = DAG.getBitcast(VT, NewAnd);
if (IsStrict)
return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
return Res;
}
return SDValue();
}
/// If we are converting a value to floating-point, try to replace scalar
/// truncate of an extracted vector element with a bitcast. This tries to keep
/// the sequence on XMM registers rather than moving between vector and GPRs.
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
// TODO: This is currently only used by combineSIntToFP, but it is generalized
// to allow being called by any similar cast opcode.
// TODO: Consider merging this into lowering: vectorizeExtractedCast().
SDValue Trunc = N->getOperand(0);
if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
return SDValue();
SDValue ExtElt = Trunc.getOperand(0);
if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isNullConstant(ExtElt.getOperand(1)))
return SDValue();
EVT TruncVT = Trunc.getValueType();
EVT SrcVT = ExtElt.getValueType();
unsigned DestWidth = TruncVT.getSizeInBits();
unsigned SrcWidth = SrcVT.getSizeInBits();
if (SrcWidth % DestWidth != 0)
return SDValue();
// inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
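// e.g. with concrete types (illustrative):
// sitofp (trunc (extelt (v2i64 X), 0) to i32)
// --> sitofp (extelt (bitcast X to v4i32), 0)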
EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
unsigned VecWidth = SrcVecVT.getSizeInBits();
unsigned NumElts = VecWidth / DestWidth;
EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
SDLoc DL(N);
SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
BitcastVec, ExtElt.getOperand(1));
return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
}
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsStrict = N->isStrictFPOpcode();
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
// UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
// UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
// UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
// UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
if (DAG.SignBitIsZero(Op0)) {
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
{N->getOperand(0), Op0});
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
}
return SDValue();
}
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
bool IsStrict = N->isStrictFPOpcode();
if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
return Res;
// Now move on to more general possibilities.
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
// SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Without AVX512DQ we only support i64 to float scalar conversion. For both
// vectors and scalars, see if we know that the upper bits are all the sign
// bit, in which case we can truncate the input to i32 and convert from that.
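// e.g. if Op0 is (sext i32 Y to i64), it has at least 33 sign bits, so
// (sint_to_fp Op0) can become (sint_to_fp (trunc Op0 to i32)), and the
// trunc/sext pair then folds away.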
if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
unsigned BitWidth = InVT.getScalarSizeInBits();
unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
if (NumSignBits >= (BitWidth - 31)) {
EVT TruncVT = MVT::i32;
if (InVT.isVector())
TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
InVT.getVectorNumElements());
SDLoc dl(N);
if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{N->getOperand(0), Trunc});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
}
// If we're after legalize and the type is v2i32 we need to shuffle and
// use CVTSI2P.
assert(InVT == MVT::v2i64 && "Unexpected VT!");
SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
{ 0, 2, -1, -1 });
if (IsStrict)
return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
{N->getOperand(0), Shuf});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
}
}
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
EVT LdVT = Ld->getValueType(0);
// This transformation is not supported if the result type is f16 or f128.
if (VT == MVT::f16 || VT == MVT::f128)
return SDValue();
// If we have AVX512DQ we can use packed conversion instructions unless
// the VT is f80.
if (Subtarget.hasDQI() && VT != MVT::f80)
return SDValue();
if (Ld->isSimple() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!Subtarget.is64Bit() && LdVT == MVT::i64) {
std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD(
SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
return Tmp.first;
}
}
if (IsStrict)
return SDValue();
if (SDValue V = combineToFPTruncExtElt(N, DAG))
return V;
return SDValue();
}
static bool needCarryOrOverflowFlag(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
X86::CondCode CC;
switch (User->getOpcode()) {
default:
// Be conservative.
return true;
case X86ISD::SETCC:
case X86ISD::SETCC_CARRY:
CC = (X86::CondCode)User->getConstantOperandVal(0);
break;
case X86ISD::BRCOND:
CC = (X86::CondCode)User->getConstantOperandVal(2);
break;
case X86ISD::CMOV:
CC = (X86::CondCode)User->getConstantOperandVal(2);
break;
}
switch (CC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
case X86::COND_O: case X86::COND_NO:
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
return true;
}
}
return false;
}
static bool onlyZeroFlagUsed(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
unsigned CCOpNo;
switch (User->getOpcode()) {
default:
// Be conservative.
return false;
case X86ISD::SETCC: CCOpNo = 0; break;
case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
case X86ISD::BRCOND: CCOpNo = 2; break;
case X86ISD::CMOV: CCOpNo = 2; break;
}
X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
if (CC != X86::COND_E && CC != X86::COND_NE)
return false;
}
return true;
}
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
// Only handle test patterns.
if (!isNullConstant(N->getOperand(1)))
return SDValue();
// If we have a CMP of a truncated binop, see if we can make a smaller binop
// and use its flags directly.
// TODO: Maybe we should try promoting compares that only use the zero flag
// first if we can prove the upper bits with computeKnownBits?
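// Illustrative example of the narrowing below (assuming the carry and
// overflow flags are unused):
// cmp (trunc (add i64 X, Y) to i8), 0
// --> flags of (X86ISD::ADD (trunc X to i8), (trunc Y to i8))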
SDLoc dl(N);
SDValue Op = N->getOperand(0);
EVT VT = Op.getValueType();
// If we have a constant logical shift that's only used in a comparison
// against zero turn it into an equivalent AND. This allows turning it into
// a TEST instruction later.
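// e.g. for i32 (illustrative): (srl X, 8) == 0 --> (X & 0xFFFFFF00) == 0,
// which isel can select as "test $0xffffff00, X".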
if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
onlyZeroFlagUsed(SDValue(N, 0))) {
unsigned BitWidth = VT.getSizeInBits();
const APInt &ShAmt = Op.getConstantOperandAPInt(1);
if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
APInt Mask = Op.getOpcode() == ISD::SRL
? APInt::getHighBitsSet(BitWidth, MaskBits)
: APInt::getLowBitsSet(BitWidth, MaskBits);
if (Mask.isSignedIntN(32)) {
Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
DAG.getConstant(Mask, dl, VT));
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, VT));
}
}
}
// Look for a truncate with a single use.
if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
return SDValue();
Op = Op.getOperand(0);
// Arithmetic op can only have one use.
if (!Op.hasOneUse())
return SDValue();
unsigned NewOpc;
switch (Op.getOpcode()) {
default: return SDValue();
case ISD::AND:
// Skip AND with a constant. We have special handling for AND with an
// immediate during isel to generate test instructions.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return SDValue();
NewOpc = X86ISD::AND;
break;
case ISD::OR: NewOpc = X86ISD::OR; break;
case ISD::XOR: NewOpc = X86ISD::XOR; break;
case ISD::ADD:
// If the carry or overflow flag is used, we can't truncate.
if (needCarryOrOverflowFlag(SDValue(N, 0)))
return SDValue();
NewOpc = X86ISD::ADD;
break;
case ISD::SUB:
// If the carry or overflow flag is used, we can't truncate.
if (needCarryOrOverflowFlag(SDValue(N, 0)))
return SDValue();
NewOpc = X86ISD::SUB;
break;
}
// We found an op we can narrow. Truncate its inputs.
SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
// Use a X86 specific opcode to avoid DAG combine messing with it.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
// For AND, keep a CMP so that we can match the test pattern.
if (NewOpc == X86ISD::AND)
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, VT));
// Return the flags.
return Op.getValue(1);
}
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
"Expected X86ISD::ADD or X86ISD::SUB");
SDLoc DL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
MVT VT = LHS.getSimpleValueType();
unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
// If we don't use the flag result, simplify back to a generic ADD/SUB.
if (!N->hasAnyUseOfValue(1)) {
SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
}
// Fold any similar generic ADD/SUB opcodes to reuse this node.
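// e.g. if a generic (add X, Y) node exists alongside this (X86ISD::ADD X, Y),
// CombineTo below replaces the generic node with this node's value result;
// for SUB, the commuted generic (sub Y, X) is matched by negating the result.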
auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
SDValue Ops[] = {N0, N1};
SDVTList VTs = DAG.getVTList(N->getValueType(0));
if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
SDValue Op(N, 0);
if (Negate)
Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
DCI.CombineTo(GenericAddSub, Op);
}
};
MatchGeneric(LHS, RHS, false);
MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
return SDValue();
}
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
N->getOperand(0), N->getOperand(1),
Flags);
}
// Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
// iff the flag result is dead.
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
!N->hasAnyUseOfValue(1))
return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
Op0.getOperand(1), N->getOperand(2));
return SDValue();
}
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// If the LHS and RHS of the ADC node are zero, then it can't overflow and
// the result is either zero or one (depending on the input carry bit).
// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
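// e.g. (adc 0, 0, EFLAGS) computes 0 + 0 + CF, which is exactly CF, so it
// becomes (and (setcc_carry COND_B, EFLAGS), 1) and a zero carry-out.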
if (X86::isZeroNode(N->getOperand(0)) &&
X86::isZeroNode(N->getOperand(1)) &&
// We don't have a good way to replace an EFLAGS use, so only do this when
// it is dead right now.
SDValue(N, 1).use_empty()) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
SDValue Res1 =
DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
N->getOperand(2)),
DAG.getConstant(1, DL, VT));
return DCI.CombineTo(N, Res1, CarryOut);
}
if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
N->getOperand(0), N->getOperand(1),
Flags);
}
return SDValue();
}
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
bool IsSub = N->getOpcode() == ISD::SUB;
SDValue X = N->getOperand(0);
SDValue Y = N->getOperand(1);
// If this is an add, canonicalize a zext operand to the RHS.
// TODO: Incomplete? What if both sides are zexts?
if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
Y.getOpcode() != ISD::ZERO_EXTEND)
std::swap(X, Y);
// Look through a one-use zext.
bool PeekedThroughZext = false;
if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
Y = Y.getOperand(0);
PeekedThroughZext = true;
}
// If this is an add, canonicalize a setcc operand to the RHS.
// TODO: Incomplete? What if both sides are setcc?
// TODO: Should we allow peeking through a zext of the other operand?
if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
Y.getOpcode() != X86ISD::SETCC)
std::swap(X, Y);
if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
auto *ConstantX = dyn_cast<ConstantSDNode>(X);
if (ConstantX) {
if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
// This is a complicated way to get -1 or 0 from the carry flag:
// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
Y.getOperand(1));
}
if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
SDValue EFLAGS = Y->getOperand(1);
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
// Swap the operands of a SUB, and we have the same pattern as above.
// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
SDValue NewSub = DAG.getNode(
X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
NewEFLAGS);
}
}
}
if (CC == X86::COND_B) {
// X + SETB Z --> adc X, 0
// X - SETB Z --> sbb X, 0
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), Y.getOperand(1));
}
if (CC == X86::COND_A) {
SDValue EFLAGS = Y->getOperand(1);
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
// Do not flip "e > c", where "c" is a constant, because the CMP
// instruction cannot take an immediate as its first operand.
//
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), NewEFLAGS);
}
}
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
SDValue Cmp = Y.getOperand(1);
if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
!X86::isZeroNode(Cmp.getOperand(1)) ||
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
SDValue Z = Cmp.getOperand(0);
EVT ZVT = Z.getValueType();
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
if (ConstantX) {
// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
// fake operands:
// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
SDValue Zero = DAG.getConstant(0, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
SDValue(Neg.getNode(), 1));
}
// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
// with fake operands:
// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
SDValue One = DAG.getConstant(1, DL, ZVT);
SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1);
}
}
// (cmp Z, 1) sets the carry flag if Z is 0.
SDValue One = DAG.getConstant(1, DL, ZVT);
SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
// Add the flags type for ADC/SBB nodes.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
DAG.getConstant(-1ULL, DL, VT), Cmp1);
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
DAG.getConstant(0, DL, VT), Cmp1);
}
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
EVT VT = N->getValueType(0);
// If the vector has fewer than 8 elements, do not use PMADDWD; vectors
// wider than the supported RegSize are split by SplitOpsAndApply below.
if (!VT.isVector() || VT.getVectorNumElements() < 8)
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
auto UsePMADDWD = [&](SDValue Op) {
ShrinkMode Mode;
return Op.getOpcode() == ISD::MUL &&
canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
Mode != ShrinkMode::MULU16 &&
(!Subtarget.hasSSE41() ||
(Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
Op->isOnlyUserOf(Op.getOperand(1).getNode())));
};
SDValue MulOp, OtherOp;
if (UsePMADDWD(Op0)) {
MulOp = Op0;
OtherOp = Op1;
} else if (UsePMADDWD(Op1)) {
MulOp = Op1;
OtherOp = Op0;
} else
return SDValue();
SDLoc DL(N);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
VT.getVectorNumElements());
EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
VT.getVectorNumElements() / 2);
// Shrink the operands of mul.
SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
// Madd vector size is half of the original vector size
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
PMADDWDBuilder);
// Fill the rest of the output with 0
SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
// Preserve the reduction flag on the ADD. We may need to revisit for the
// other operand.
SDNodeFlags Flags;
Flags.setVectorReduction(true);
return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
}
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
// TODO: There's nothing special about i32, any integer type above i16 should
// work just as well.
if (!VT.isVector() || !VT.isSimple() ||
VT.getVectorElementType() != MVT::i32)
return SDValue();
unsigned RegSize = 128;
if (Subtarget.useBWIRegs())
RegSize = 512;
else if (Subtarget.hasAVX())
RegSize = 256;
// We only handle up to v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
if (VT.getSizeInBits() / 4 > RegSize)
return SDValue();
// We know N is a reduction add. To match SAD, we need one of the operands to
// be an ABS.
SDValue AbsOp = N->getOperand(0);
SDValue OtherOp = N->getOperand(1);
if (AbsOp.getOpcode() != ISD::ABS)
std::swap(AbsOp, OtherOp);
if (AbsOp.getOpcode() != ISD::ABS)
return SDValue();
// Check whether we have an abs-diff pattern feeding into the ABS node.
SDValue SadOp0, SadOp1;
if (!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
return SDValue();
// SAD pattern detected. Now build a SAD instruction and an addition for
// reduction. Note that the number of elements of the result of SAD is less
// than the number of elements of its input. Therefore, we could only update
// part of elements in the reduction vector.
SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
// The output of PSADBW is a vector of i64.
// We need to turn the vector of i64 into a vector of i32.
// If the reduction vector is at least as wide as the psadbw result, just
// bitcast. If it's narrower (which can only occur for v2i32), bits 127:16 of
// the PSADBW will be zero. If we promote/narrow vectors, truncate the v2i64
// result to v2i32, which will be removed by type legalization. If we widen
// narrow vectors, then we bitcast to v4i32 and extract v2i32.
MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
// Fill the upper elements with zero to match the add width.
assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
Ops[0] = Sad;
Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
} else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
DAG.getIntPtrConstant(0, DL));
}
// Preserve the reduction flag on the ADD. We may need to revisit for the
// other operand.
SDNodeFlags Flags;
Flags.setVectorReduction(true);
return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
}
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
// Example of the pattern we try to detect:
// t := (v8i32 mul (sext (v8i16 x0)), (sext (v8i16 x1)))
// (add (build_vector (extract_elt t, 0),
// (extract_elt t, 2),
// (extract_elt t, 4),
// (extract_elt t, 6)),
// (build_vector (extract_elt t, 1),
// (extract_elt t, 3),
// (extract_elt t, 5),
// (extract_elt t, 7)))
if (!Subtarget.hasSSE2())
return SDValue();
if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
Op1.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
VT.getVectorNumElements() < 4 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
// Check if one of Op0,Op1 is of the form:
// (build_vector (extract_elt Mul, 0),
// (extract_elt Mul, 2),
// (extract_elt Mul, 4),
// ...
// the other is of the form:
// (build_vector (extract_elt Mul, 1),
// (extract_elt Mul, 3),
// (extract_elt Mul, 5),
// ...
// and identify Mul.
SDValue Mul;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
// TODO: Be more tolerant to undefs.
if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
if (!Const0L || !Const1L || !Const0H || !Const1H)
return SDValue();
unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
// Commutativity of mul allows factors of a product to reorder.
if (Idx0L > Idx1L)
std::swap(Idx0L, Idx1L);
if (Idx0H > Idx1H)
std::swap(Idx0H, Idx1H);
// Commutativity of add allows pairs of factors to reorder.
if (Idx0L > Idx0H) {
std::swap(Idx0L, Idx0H);
std::swap(Idx1L, Idx1H);
}
if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
Idx1H != 2 * i + 3)
return SDValue();
if (!Mul) {
// First time an extract_elt's source vector is visited. It must be a MUL
// with twice as many vector elements as the BUILD_VECTOR.
// Both extracts must be from the same MUL.
Mul = Op0L->getOperand(0);
if (Mul->getOpcode() != ISD::MUL ||
Mul.getValueType().getVectorNumElements() != 2 * e)
return SDValue();
}
// Check that the extract is from the same MUL previously seen.
if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
return SDValue();
}
// Check if the Mul source can be safely shrunk.
ShrinkMode Mode;
if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
Mode == ShrinkMode::MULU16)
return SDValue();
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT InVT = Ops[0].getValueType();
assert(InVT.getScalarType() == MVT::i32 &&
"Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements() / 2);
EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
InVT.getVectorNumElements());
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT,
{ Mul.getOperand(0), Mul.getOperand(1) },
PMADDBuilder);
}
// Attempt to turn this pattern into PMADDWD.
// (mul (add (sext (build_vector)), (sext (build_vector))),
// (add (sext (build_vector)), (sext (build_vector)))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
return SDValue();
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
VT.getVectorNumElements() < 4 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
// All inputs need to be sign extends.
// TODO: Support ZERO_EXTEND from known positive?
if (N00.getOpcode() != ISD::SIGN_EXTEND ||
N01.getOpcode() != ISD::SIGN_EXTEND ||
N10.getOpcode() != ISD::SIGN_EXTEND ||
N11.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();
// Peek through the extends.
N00 = N00.getOperand(0);
N01 = N01.getOperand(0);
N10 = N10.getOperand(0);
N11 = N11.getOperand(0);
// Must be extending from vXi16.
EVT InVT = N00.getValueType();
if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
N10.getValueType() != InVT || N11.getValueType() != InVT)
return SDValue();
// All inputs should be build_vectors.
if (N00.getOpcode() != ISD::BUILD_VECTOR ||
N01.getOpcode() != ISD::BUILD_VECTOR ||
N10.getOpcode() != ISD::BUILD_VECTOR ||
N11.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// For each result element, we need the even element from one vector
// multiplied by the even element from the other vector, added to the
// product of the odd elements from those same two vectors. That is, for
// each element i the following operation must be performed:
// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
SDValue In0, In1;
for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
SDValue N00Elt = N00.getOperand(i);
SDValue N01Elt = N01.getOperand(i);
SDValue N10Elt = N10.getOperand(i);
SDValue N11Elt = N11.getOperand(i);
// TODO: Be more tolerant to undefs.
if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
return SDValue();
unsigned IdxN00 = ConstN00Elt->getZExtValue();
unsigned IdxN01 = ConstN01Elt->getZExtValue();
unsigned IdxN10 = ConstN10Elt->getZExtValue();
unsigned IdxN11 = ConstN11Elt->getZExtValue();
// Add is commutative so indices can be reordered.
if (IdxN00 > IdxN10) {
std::swap(IdxN00, IdxN10);
std::swap(IdxN01, IdxN11);
}
// N0 indices must be the even element. N1 indices must be the next odd element.
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
return SDValue();
SDValue N00In = N00Elt.getOperand(0);
SDValue N01In = N01Elt.getOperand(0);
SDValue N10In = N10Elt.getOperand(0);
SDValue N11In = N11Elt.getOperand(0);
// First time we find an input capture it.
if (!In0) {
In0 = N00In;
In1 = N01In;
}
// Mul is commutative so the input vectors can be in any order.
// Canonicalize to make the compares easier.
if (In0 != N00In)
std::swap(N00In, N01In);
if (In0 != N10In)
std::swap(N10In, N11In);
if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
return SDValue();
}
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT OpVT = Ops[0].getValueType();
assert(OpVT.getScalarType() == MVT::i16 &&
"Unexpected scalar element type");
assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
OpVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
PMADDBuilder);
}
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
const SDNodeFlags Flags = N->getFlags();
if (Flags.hasVectorReduction()) {
if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
return Sad;
if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
return MAdd;
}
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
return MAdd;
if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
return MAdd;
// Try to synthesize horizontal adds from adds of shuffles.
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&
isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) {
auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
HADDBuilder);
}
// If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
// (sub Y, (sext (vXi1 X))).
// FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
// generic DAG combine without a legal type check, but adding this there
// caused regressions.
if (VT.isVector()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
SDLoc DL(N);
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
}
if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
SDLoc DL(N);
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
}
}
return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
// PSUBUS is supported starting from SSE2, but truncation for v8i32
// is only worth it with SSSE3 (PSHUFB).
EVT EltVT = VT.getVectorElementType();
if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) &&
!(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
!(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
return SDValue();
SDValue SubusLHS, SubusRHS;
// Try to find umax(a,b) - b or a - umin(a,b) patterns;
// they may be converted to subus(a,b).
// TODO: Need to add IR canonicalization for this code.
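// e.g. (sub (umax a, b), b) is a-b when a >= b and 0 otherwise, which is
// exactly the unsigned saturating subtract usubsat(a, b).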
if (Op0.getOpcode() == ISD::UMAX) {
SubusRHS = Op1;
SDValue MaxLHS = Op0.getOperand(0);
SDValue MaxRHS = Op0.getOperand(1);
if (MaxLHS == Op1)
SubusLHS = MaxRHS;
else if (MaxRHS == Op1)
SubusLHS = MaxLHS;
else
return SDValue();
} else if (Op1.getOpcode() == ISD::UMIN) {
SubusLHS = Op0;
SDValue MinLHS = Op1.getOperand(0);
SDValue MinRHS = Op1.getOperand(1);
if (MinLHS == Op0)
SubusRHS = MinRHS;
else if (MinRHS == Op0)
SubusRHS = MinLHS;
else
return SDValue();
} else
return SDValue();
// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
// special preprocessing in some cases.
if (EltVT == MVT::i8 || EltVT == MVT::i16)
return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
"Unexpected VT!");
// The special preprocessing can only be applied if the value was zero
// extended from 16 bits, so we require the upper 16 bits to be zero for
// 32-bit values, or the upper 48 bits for 64-bit values.
KnownBits Known = DAG.computeKnownBits(SubusLHS);
unsigned NumZeros = Known.countMinLeadingZeros();
if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
return SDValue();
EVT ExtType = SubusLHS.getValueType();
EVT ShrinkedType;
if (VT == MVT::v8i32 || VT == MVT::v8i64)
ShrinkedType = MVT::v8i16;
else
ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
// If SubusLHS is zero-extended, clamp SubusRHS to the truncated size first:
// SubusRHS = umin(0xFFF.., SubusRHS).
SDValue SaturationConst =
DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
ShrinkedType.getScalarSizeInBits()),
SDLoc(SubusLHS), ExtType);
SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
SaturationConst);
SDValue NewSubusLHS =
DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
NewSubusLHS, NewSubusRHS);
// Zero extend the result; it may be used somewhere as 32 bit. If not, the
// zext and the following trunc will be folded away.
return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// X86 can't encode an immediate LHS of a sub. See if we can push the
// negation into a preceding instruction.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
// If the RHS of the sub is a XOR with one use and a constant, invert the
// immediate. Then add one to the LHS of the sub so we can turn
// X-Y -> X+~Y+1, saving one register.
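// Illustrative example: (sub 5, (xor X, 3)) --> (add (xor X, ~3), 6),
// using the identity C - (X ^ K) == (X ^ ~K) + (C + 1).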
if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
isa<ConstantSDNode>(Op1.getOperand(1))) {
const APInt &XorC = Op1.getConstantOperandAPInt(1);
EVT VT = Op0.getValueType();
SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
Op1.getOperand(0),
DAG.getConstant(~XorC, SDLoc(Op1), VT));
return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
}
}
// Try to synthesize horizontal subs from subs of shuffles.
EVT VT = N->getValueType(0);
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&
isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) {
auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
HSUBBuilder);
}
// Try to create PSUBUS if SUB's argument is max/min
if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
return V;
return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
if (N->getOperand(0) == N->getOperand(1)) {
if (N->getOpcode() == X86ISD::PCMPEQ)
return DAG.getConstant(-1, DL, VT);
if (N->getOpcode() == X86ISD::PCMPGT)
return DAG.getConstant(0, DL, VT);
}
return SDValue();
}
/// Helper that combines an array of subvector ops as if they were the operands
/// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ArrayRef<SDValue> Ops, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
return DAG.getUNDEF(VT);
if (llvm::all_of(Ops, [](SDValue Op) {
return ISD::isBuildVectorAllZeros(Op.getNode());
}))
return getZeroVector(VT, Subtarget, DAG, DL);
SDValue Op0 = Ops[0];
// Fold subvector loads into one.
// If needed, look through bitcasts to get to the load.
if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
bool Fast;
const X86TargetLowering *TLI = Subtarget.getTargetLowering();
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*FirstLd->getMemOperand(), &Fast) &&
Fast) {
if (SDValue Ld =
EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
return Ld;
}
}
// Repeated subvectors.
if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
// If this broadcast/subv_broadcast is inserted into both halves, use a
// larger broadcast/subv_broadcast.
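// e.g. concat_vectors ((v4f32 vbroadcast X), (v4f32 vbroadcast X))
// --> (v8f32 vbroadcast X)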
if (Op0.getOpcode() == X86ISD::VBROADCAST ||
Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
Op0.getOperand(0),
DAG.getIntPtrConstant(0, DL)));
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
(VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
}
bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
// Repeated opcode.
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
// but it currently struggles with different vector widths.
if (llvm::all_of(Ops, [Op0](SDValue Op) {
return Op.getOpcode() == Op0.getOpcode();
})) {
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFD:
if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(Ops[i].getOperand(0));
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
Op0.getOperand(1));
}
LLVM_FALLTHROUGH;
case X86ISD::VPERMILPI:
// TODO - add support for vXf64/vXi64 shuffles.
if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
Op0.getOperand(1));
return DAG.getBitcast(VT, Res);
}
break;
case X86ISD::PACKUS:
if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
RHS.push_back(Ops[i].getOperand(1));
}
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumOps * SrcVT.getVectorNumElements());
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
}
break;
}
}
return SDValue();
}
static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Don't do anything for i1 vectors.
if (VT.getVectorElementType() == MVT::i1)
return SDValue();
if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
DCI, Subtarget))
return R;
}
return SDValue();
}
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
MVT OpVT = N->getSimpleValueType(0);
bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
SDLoc dl(N);
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
uint64_t IdxVal = N->getConstantOperandVal(2);
MVT SubVecVT = SubVec.getSimpleValueType();
if (Vec.isUndef() && SubVec.isUndef())
return DAG.getUNDEF(OpVT);
  // Inserting undefs/zeros into zeros/undefs yields a zero vector.
if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
(SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
return getZeroVector(OpVT, Subtarget, DAG, dl);
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
// If we're inserting into a zero vector and then into a larger zero vector,
// just insert into the larger zero vector directly.
if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
SubVec.getOperand(1),
DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
}
    // If we're inserting into a zero vector and our input was extracted from
    // an insert into a zero vector of the same type, and the extraction was at
    // least as large as the original insertion, just insert the original
    // subvector into a zero vector.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
isNullConstant(SubVec.getOperand(1)) &&
SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Ins = SubVec.getOperand(0);
if (isNullConstant(Ins.getOperand(2)) &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
Ins.getOperand(1), N->getOperand(2));
}
}
// Stop here if this is an i1 vector.
if (IsI1Vector)
return SDValue();
// If this is an insert of an extract, combine to a shuffle. Don't do this
// if the insert or extract can be represented with a subregister operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
(IdxVal != 0 || !Vec.isUndef())) {
int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();
int SubVecNumElts = SubVecVT.getVectorNumElements();
SmallVector<int, 64> Mask(VecNumElts);
// First create an identity shuffle mask.
for (int i = 0; i != VecNumElts; ++i)
Mask[i] = i;
// Now insert the extracted portion.
for (int i = 0; i != SubVecNumElts; ++i)
Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
}
}
// Match concat_vector style patterns.
SmallVector<SDValue, 2> SubVectorOps;
if (collectConcatOps(N, SubVectorOps)) {
if (SDValue Fold =
combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
return Fold;
// If we're inserting all zeros into the upper half, change this to
// a concat with zero. We will match this to a move
// with implicit upper bit zeroing during isel.
// We do this here because we don't want combineConcatVectorOps to
// create INSERT_SUBVECTOR from CONCAT_VECTORS.
if (SubVectorOps.size() == 2 &&
ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
}
// If this is a broadcast insert into an upper undef, use a larger broadcast.
if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
// If this is a broadcast load inserted into an upper undef, use a larger
// broadcast load.
if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
MemIntr->getMemoryVT(),
MemIntr->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
return BcastLd;
}
return SDValue();
}
/// If we are extracting a subvector of a vector select and the select condition
/// is composed of concatenated vectors, try to narrow the select width. This
/// is a common pattern for AVX1 integer code because 256-bit selects may be
/// legal, but there is almost no integer math/logic available for 256-bit.
/// This function should only be called with legal types (otherwise, the calls
/// to get simple value types will assert).
static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
SmallVector<SDValue, 4> CatOps;
if (Sel.getOpcode() != ISD::VSELECT ||
!collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
return SDValue();
// Note: We assume simple value types because this should only be called with
// legal operations/types.
// TODO: This can be extended to handle extraction to 256-bits.
MVT VT = Ext->getSimpleValueType(0);
if (!VT.is128BitVector())
return SDValue();
MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
return SDValue();
MVT WideVT = Ext->getOperand(0).getSimpleValueType();
MVT SelVT = Sel.getSimpleValueType();
assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
"Unexpected vector type with legal operations");
unsigned SelElts = SelVT.getVectorNumElements();
unsigned CastedElts = WideVT.getVectorNumElements();
unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
if (SelElts % CastedElts == 0) {
// The select has the same or more (narrower) elements than the extract
// operand. The extraction index gets scaled by that factor.
ExtIdx *= (SelElts / CastedElts);
} else if (CastedElts % SelElts == 0) {
    // The select has fewer (wider) elements than the extract operand. Make
    // sure that the extraction index can be divided evenly.
unsigned IndexDivisor = CastedElts / SelElts;
if (ExtIdx % IndexDivisor != 0)
return SDValue();
ExtIdx /= IndexDivisor;
} else {
    llvm_unreachable("Element counts of simple vector types are not divisible?");
}
unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
unsigned NarrowElts = SelElts / NarrowingFactor;
MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
SDLoc DL(Ext);
SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
return DAG.getBitcast(VT, NarrowSel);
}
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// For AVX1 only, if we are extracting from a 256-bit and+not (which will
// eventually get combined/lowered into ANDNP) with a concatenated operand,
// split the 'and' into 128-bit ops to avoid the concatenate and extract.
// We let generic combining take over from there to simplify the
// insert/extract and 'not'.
// This pattern emerges during AVX1 legalization. We handle it before lowering
// to avoid complications like splitting constant vector loads.
// Capture the original wide type in the likely case that we need to bitcast
// back to this type.
if (!N->getValueType(0).isSimple())
return SDValue();
MVT VT = N->getSimpleValueType(0);
SDValue InVec = N->getOperand(0);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
TLI.isTypeLegal(InVecVT) &&
InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) {
auto isConcatenatedNot = [] (SDValue V) {
V = peekThroughBitcasts(V);
if (!isBitwiseNot(V))
return false;
SDValue NotOp = V->getOperand(0);
return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
};
if (isConcatenatedNot(InVecBC.getOperand(0)) ||
isConcatenatedNot(InVecBC.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
SDValue Concat = split256IntArith(InVecBC, DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
}
}
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
if (VT.getScalarType() == MVT::i1)
return DAG.getConstant(1, SDLoc(N), VT);
return getOnesVector(VT, DAG, SDLoc(N));
}
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(
VT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
  // If we are extracting from an insert into a zero vector, replace with a
  // smaller insert into zero, provided the extraction covers at least as much
  // as the original insertion. Don't do this for i1 vectors.
if (VT.getVectorElementType() != MVT::i1 &&
InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
SDLoc DL(N);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL),
InVec.getOperand(1), InVec.getOperand(2));
}
// If we're extracting from a broadcast then we're better off just
// broadcasting to the smaller type directly, assuming this is the only use.
  // As it's a broadcast we don't care about the extraction index.
if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
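  // Similarly, extracting from a broadcast load can simply re-broadcast-load
  // at the narrower type, provided the loaded memory type still fits.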
if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
MemIntr->getMemoryVT(),
MemIntr->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
return BcastLd;
}
}
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
unsigned InOpcode = InVec.getOpcode();
if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTUDQ2PD(v4i32).
if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTPS2PD(v4f32).
if (InOpcode == ISD::FP_EXTEND &&
InVec.getOperand(0).getValueType() == MVT::v4f32) {
return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
}
}
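    // extract_subvector(ext(x)) -> ext_vector_inreg(x), provided both the
    // result and the extend source are 128-bit vectors.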
if ((InOpcode == ISD::ANY_EXTEND ||
InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
InOpcode == ISD::ZERO_EXTEND ||
InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
InOpcode == ISD::SIGN_EXTEND ||
InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
VT.is128BitVector() &&
InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
}
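    // Narrow a 256-bit vselect to 128 bits by using only the low halves of
    // all three operands.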
if (InOpcode == ISD::VSELECT &&
InVec.getOperand(0).getValueType().is256BitVector() &&
InVec.getOperand(1).getValueType().is256BitVector() &&
InVec.getOperand(2).getValueType().is256BitVector()) {
SDLoc DL(N);
SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
}
return SDValue();
}
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
  // If this is a scalar_to_vector to v1i1 from an AND with 1, bypass the and.
// This occurs frequently in our masked scalar intrinsic code and our
// floating point select lowering with AVX512.
// TODO: SimplifyDemandedBits instead?
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->getAPIntValue().isOneValue())
return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
Src.getOperand(0));
// Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->isNullValue())
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
Src.getOperand(1));
// Reduce v2i64 to v4i32 if we don't need the upper bits.
// TODO: Move to DAGCombine?
if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
Src.getOperand(0).getScalarValueSizeInBits() <= 32)
return DAG.getBitcast(
VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
return SDValue();
}
// Simplify PMULDQ and PMULUDQ operations.
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
// Canonicalize constant to RHS.
if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
!DAG.isConstantIntBuildVectorOrConstantInt(RHS))
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
// Multiply by zero.
// Don't return RHS as it may contain UNDEFs.
if (ISD::isBuildVectorAllZeros(RHS.getNode()))
return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
  // PMULDQ/PMULUDQ only use the lower 32 bits from each vector element.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
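  // Demanding all 64 bits of each result element still lets the target's
  // SimplifyDemandedBits hook shrink the operands, since only the low 32 bits
  // of each input element feed the multiply.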
if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
return SDValue(N, 0);
// If the input is an extend_invec and the SimplifyDemandedBits call didn't
// convert it to any_extend_invec, due to the LegalOperations check, do the
// conversion directly to a vector shuffle manually. This exposes combine
// opportunities missed by combineExtInVec not calling
// combineX86ShufflesRecursively on SSE4.1 targets.
// FIXME: This is basically a hack around several other issues related to
// ANY_EXTEND_VECTOR_INREG.
if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
(LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
LHS.getOperand(0).getValueType() == MVT::v4i32) {
SDLoc dl(N);
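    // The { 0, -1, 1, -1 } mask places source elements 0 and 1 in the low
    // halves of the two i64 lanes, leaving the high halves undef; this is
    // any_extend_vector_inreg expressed as a shuffle.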
LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
LHS.getOperand(0), { 0, -1, 1, -1 });
LHS = DAG.getBitcast(MVT::v2i64, LHS);
return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
}
if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
(RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
RHS.getOperand(0).getValueType() == MVT::v4i32) {
SDLoc dl(N);
RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
RHS.getOperand(0), { 0, -1, 1, -1 });
RHS = DAG.getBitcast(MVT::v2i64, RHS);
return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
}
return SDValue();
}
static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Try to merge vector loads and extend_inreg to an extload.
if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
In.hasOneUse()) {
auto *Ld = cast<LoadSDNode>(In);
if (Ld->isSimple()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
      ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
                                 ? ISD::SEXTLOAD
                                 : ISD::ZEXTLOAD;
EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
VT.getVectorNumElements());
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
SDValue Load =
DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
return Load;
}
}
}
// Attempt to combine as a shuffle.
// TODO: SSE41 support
if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
SDValue Op(N, 0);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
return SDValue();
}
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
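  // Otherwise, see if demanded-elements analysis can simplify the shift.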
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
return SDValue();
}
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
case ISD::SCALAR_TO_VECTOR:
return combineScalarToVector(N, DAG);
case ISD::EXTRACT_VECTOR_ELT:
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case ISD::CONCAT_VECTORS:
return combineConcatVectors(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::EXTRACT_SUBVECTOR:
return combineExtractSubvector(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
case X86ISD::CMP: return combineCMP(N, DAG);
case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL: return combineShiftLeft(N, DAG);
case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
return combineSIntToFP(N, DAG, DCI, Subtarget);
case ISD::UINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
case X86ISD::VTRUNC: return combineVTRUNC(N, DAG);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
case X86ISD::CVTTP2SI:
case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::PACKSS:
case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
case X86ISD::VSHL:
case X86ISD::VSRA:
case X86ISD::VSRL:
return combineVectorShiftVar(N, DAG, DCI, Subtarget);
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
return combineVectorShiftImm(N, DAG, DCI, Subtarget);
case X86ISD::PINSRB:
case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::BLENDI:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
case X86ISD::MOVHLPS:
case X86ISD::MOVLHPS:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::VBROADCAST:
case X86ISD::VPPERM:
case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
case X86ISD::FMADD_RND:
case X86ISD::FMSUB:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB:
case X86ISD::FNMSUB_RND:
case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
case X86ISD::MGATHER:
case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
case X86ISD::KSHIFTL:
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
}
return SDValue();
}
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (!isTypeLegal(VT))
return false;
// There are no vXi8 shifts.
if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
return false;
// TODO: Almost no 8-bit ops are desirable because they have no actual
// size/speed advantages vs. 32-bit ops, but they do have a major
// potential disadvantage by causing partial register stalls.
//
// 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
// we have specializations to turn 32-bit multiply/shl into LEA or other ops.
// Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
// check for a constant operand to the multiply.
if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
return false;
// i16 instruction encodings are longer and some i16 instructions are slow,
// so those are not desirable.
if (VT == MVT::i16) {
switch (Opc) {
default:
break;
case ISD::LOAD:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::SUB:
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
return false;
}
}
// Any legal type not explicitly accounted for above here is desirable.
return true;
}
SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
SDValue Value, SDValue Addr,
SelectionDAG &DAG) const {
const Module *M = DAG.getMachineFunction().getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
if (IsCFProtectionSupported) {
    // In case control-flow branch protection is enabled, we need to add a
    // notrack prefix to the indirect branch. To do that we create an NT_BRIND
    // SDNode. Upon ISEL, the pattern will convert it to a jmp with the NoTrack
    // prefix.
return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
}
return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
}
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
EVT VT = Op.getValueType();
bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
isa<ConstantSDNode>(Op.getOperand(1));
// i16 is legal, but undesirable since i16 instruction encodings are longer
// and some i16 instructions are slow.
// 8-bit multiply-by-constant can usually be expanded to something cheaper
// using LEA and/or other ALU ops.
if (VT != MVT::i16 && !Is8BitMulByConstant)
return false;
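  // Returns true when Op's only use is a store to the same address that the
  // given load reads from, i.e. the pattern can fold into a single RMW
  // instruction.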
auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
if (!Op.hasOneUse())
return false;
SDNode *User = *Op->use_begin();
if (!ISD::isNormalStore(User))
return false;
auto *Ld = cast<LoadSDNode>(Load);
auto *St = cast<StoreSDNode>(User);
return Ld->getBasePtr() == St->getBasePtr();
};
auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
return false;
if (!Op.hasOneUse())
return false;
SDNode *User = *Op->use_begin();
if (User->getOpcode() != ISD::ATOMIC_STORE)
return false;
auto *Ld = cast<AtomicSDNode>(Load);
auto *St = cast<AtomicSDNode>(User);
return Ld->getBasePtr() == St->getBasePtr();
};
bool Commute = false;
switch (Op.getOpcode()) {
default: return false;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
return false;
break;
}
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
Commute = true;
LLVM_FALLTHROUGH;
case ISD::SUB: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// Avoid disabling potential load folding opportunities.
if (MayFoldLoad(N1) &&
(!Commute || !isa<ConstantSDNode>(N0) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
return false;
if (MayFoldLoad(N0) &&
((Commute && !isa<ConstantSDNode>(N1)) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;
if (IsFoldableAtomicRMW(N0, Op) ||
(Commute && IsFoldableAtomicRMW(N1, Op)))
return false;
}
}
PVT = MVT::i32;
return true;
}
bool X86TargetLowering::
isDesirableToCombineBuildVectorToShuffleTruncate(
ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
"Element count mismatch");
assert(
Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
"Shuffle Mask expected to be legal");
// For 32-bit elements VPERMD is better than shuffle+truncate.
  // TODO: After we improve lowerBuildVector, add exception for VPERMW.
if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
return false;
if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
return false;
return true;
}
//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
// Helper to match a string separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
for (StringRef Piece : Pieces) {
if (!S.startswith(Piece)) // Check if the piece matches.
return false;
S = S.substr(Piece.size());
StringRef::size_type Pos = S.find_first_not_of(" \t");
if (Pos == 0) // We matched a prefix.
return false;
S = S.substr(Pos);
}
return S.empty();
}
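// Returns true if the clobber list names the condition-code, flags and fpsr
// registers (and, when four pieces are present, dirflag as well).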
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
if (AsmPieces.size() == 3)
return true;
else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
return true;
}
}
return false;
}
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
const std::string &AsmStr = IA->getAsmString();
IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
// TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
SmallVector<StringRef, 4> AsmPieces;
SplitString(AsmStr, AsmPieces, ";\n");
switch (AsmPieces.size()) {
default: return false;
case 1:
// FIXME: this should verify that we are targeting a 486 or better. If not,
// we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower, so don't worry about this.
// bswap $0
if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
return IntrinsicLowering::LowerToByteSwap(CI);
}
// rorw $$8, ${0:w} --> llvm.bswap.i16
if (CI->getType()->isIntegerTy(16) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
(matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
AsmPieces.clear();
StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
}
break;
case 3:
if (CI->getType()->isIntegerTy(32) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
AsmPieces.clear();
StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
}
if (CI->getType()->isIntegerTy(64)) {
InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
if (Constraints.size() >= 2 &&
Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
return IntrinsicLowering::LowerToByteSwap(CI);
}
}
break;
}
return false;
}
static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
.Case("{@cca}", X86::COND_A)
.Case("{@ccae}", X86::COND_AE)
.Case("{@ccb}", X86::COND_B)
.Case("{@ccbe}", X86::COND_BE)
.Case("{@ccc}", X86::COND_B)
.Case("{@cce}", X86::COND_E)
.Case("{@ccz}", X86::COND_E)
.Case("{@ccg}", X86::COND_G)
.Case("{@ccge}", X86::COND_GE)
.Case("{@ccl}", X86::COND_L)
.Case("{@ccle}", X86::COND_LE)
.Case("{@ccna}", X86::COND_BE)
.Case("{@ccnae}", X86::COND_B)
.Case("{@ccnb}", X86::COND_AE)
.Case("{@ccnbe}", X86::COND_A)
.Case("{@ccnc}", X86::COND_AE)
.Case("{@ccne}", X86::COND_NE)
.Case("{@ccnz}", X86::COND_NE)
.Case("{@ccng}", X86::COND_LE)
.Case("{@ccnge}", X86::COND_L)
.Case("{@ccnl}", X86::COND_GE)
.Case("{@ccnle}", X86::COND_G)
.Case("{@ccno}", X86::COND_NO)
.Case("{@ccnp}", X86::COND_P)
.Case("{@ccns}", X86::COND_NS)
.Case("{@cco}", X86::COND_O)
.Case("{@ccp}", X86::COND_P)
.Case("{@ccs}", X86::COND_S)
.Default(X86::COND_INVALID);
return Cond;
}
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'R':
case 'q':
case 'Q':
case 'f':
case 't':
case 'u':
case 'y':
case 'x':
case 'v':
case 'Y':
case 'l':
case 'k': // AVX512 masking registers.
return C_RegisterClass;
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
return C_Register;
case 'I':
case 'J':
case 'K':
case 'N':
case 'G':
case 'L':
case 'M':
return C_Immediate;
case 'C':
case 'e':
case 'Z':
return C_Other;
default:
break;
}
}
else if (Constraint.size() == 2) {
switch (Constraint[0]) {
default:
break;
case 'Y':
switch (Constraint[1]) {
default:
break;
case 'z':
case '0':
return C_Register;
case 'i':
case 'm':
case 'k':
case 't':
case '2':
return C_RegisterClass;
}
}
} else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
return C_Other;
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
LLVM_FALLTHROUGH;
case 'R':
case 'q':
case 'Q':
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
if (CallOperandVal->getType()->isIntegerTy())
weight = CW_SpecificReg;
break;
case 'f':
case 't':
case 'u':
if (type->isFloatingPointTy())
weight = CW_SpecificReg;
break;
case 'y':
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
case 'Y': {
unsigned Size = StringRef(constraint).size();
    // Pick 'i' as the next char, since 'Yi' and 'Y' are synonymous when
    // matching 'Y'.
char NextChar = Size == 2 ? constraint[1] : 'i';
if (Size > 2)
break;
switch (NextChar) {
default:
return CW_Invalid;
// XMM0
case 'z':
case '0':
if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
return CW_SpecificReg;
return CW_Invalid;
// Conditional OpMask regs (AVX512)
case 'k':
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
return CW_Register;
return CW_Invalid;
// Any MMX reg
case 'm':
if (type->isX86_MMXTy() && Subtarget.hasMMX())
return weight;
return CW_Invalid;
// Any SSE reg when ISA >= SSE2, same as 'Y'
case 'i':
case 't':
case '2':
if (!Subtarget.hasSSE2())
return CW_Invalid;
break;
}
// Fall through (handle "Y" constraint).
LLVM_FALLTHROUGH;
}
case 'v':
if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
weight = CW_Register;
LLVM_FALLTHROUGH;
case 'x':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
weight = CW_Register;
break;
case 'k':
// Enable conditional vector operations using %k<#> registers.
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
weight = CW_Register;
break;
case 'I':
if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
if (C->getZExtValue() <= 31)
weight = CW_Constant;
}
break;
case 'J':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 63)
weight = CW_Constant;
}
break;
case 'K':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
weight = CW_Constant;
}
break;
case 'L':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
weight = CW_Constant;
}
break;
case 'M':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 3)
weight = CW_Constant;
}
break;
case 'N':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xff)
weight = CW_Constant;
}
break;
case 'G':
case 'C':
if (isa<ConstantFP>(CallOperandVal)) {
weight = CW_Constant;
}
break;
case 'e':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80000000LL) &&
(C->getSExtValue() <= 0x7fffffffLL))
weight = CW_Constant;
}
break;
case 'Z':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xffffffff)
weight = CW_Constant;
}
break;
}
return weight;
}
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
if (Subtarget.hasSSE2())
return "Y";
if (Subtarget.hasSSE1())
return "x";
}
return TargetLowering::LowerXConstraint(ConstraintVT);
}
// Lower @cc targets via setcc.
SDValue X86TargetLowering::LowerAsmOutputForConstraint(
SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
SelectionDAG &DAG) const {
X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
if (Cond == X86::COND_INVALID)
return SDValue();
// Check that return type is valid.
if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
OpInfo.ConstraintVT.getSizeInBits() < 8)
report_fatal_error("Flag output operand is of invalid type");
// Get EFLAGS register. Only update chain when copyfrom is glued.
if (Flag.getNode()) {
Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
Chain = Flag.getValue(1);
} else
Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
// Extract CC code.
SDValue CC = getSETCC(Cond, Flag, DL, DAG);
// Extend to 32-bits
SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
return Result;
}
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue>&Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1) return;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default: break;
case 'I':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 31) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'J':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 63) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'K':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (isInt<8>(C->getSExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'L':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'M':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 3) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'O':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 127) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'e': {
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getSExtValue())) {
// Widen to 64 bits here to get it sign extended.
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
break;
}
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
}
return;
}
case 'Z': {
// 32-bit unsigned value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getZExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
return;
}
case 'i': {
// Literal immediates are always ok.
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
BooleanContent BCont = getBooleanContents(MVT::i64);
ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
: ISD::SIGN_EXTEND;
int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
: CST->getSExtValue();
Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
break;
}
// In any sort of PIC mode addresses need to be computed at runtime by
// adding in a register or some sort of table lookup. These can't
// be used as immediates.
if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
return;
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
// If we require an extra load to get this address, as in PIC mode, we
// can't accept it.
if (isGlobalStubReference(
Subtarget.classifyGlobalReference(GA->getGlobal())))
return;
break;
}
}
if (Result.getNode()) {
Ops.push_back(Result);
return;
}
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variant.
static bool isGRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::GR8RegClass) ||
RC.hasSuperClassEq(&X86::GR16RegClass) ||
RC.hasSuperClassEq(&X86::GR32RegClass) ||
RC.hasSuperClassEq(&X86::GR64RegClass) ||
RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variant.
static bool isFRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
RC.hasSuperClassEq(&X86::FR64XRegClass) ||
RC.hasSuperClassEq(&X86::VR128XRegClass) ||
RC.hasSuperClassEq(&X86::VR256XRegClass) ||
RC.hasSuperClassEq(&X86::VR512RegClass);
}
/// Check if \p RC is a mask register class.
/// I.e., VK* or one of their variant.
static bool isVKClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::VK1RegClass) ||
RC.hasSuperClassEq(&X86::VK2RegClass) ||
RC.hasSuperClassEq(&X86::VK4RegClass) ||
RC.hasSuperClassEq(&X86::VK8RegClass) ||
RC.hasSuperClassEq(&X86::VK16RegClass) ||
RC.hasSuperClassEq(&X86::VK32RegClass) ||
RC.hasSuperClassEq(&X86::VK64RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
if (Constraint.size() == 1) {
// GCC Constraint Letters
switch (Constraint[0]) {
default: break;
// 'A' means [ER]AX + [ER]DX.
case 'A':
if (Subtarget.is64Bit())
return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
"Expecting 64, 32 or 16 bit subtarget");
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
// TODO: Slight differences here in allocation order and leaving
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
case 'k':
if (Subtarget.hasAVX512()) {
if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1RegClass);
if (VT == MVT::i8)
return std::make_pair(0U, &X86::VK8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::VK16RegClass);
}
if (Subtarget.hasBWI()) {
if (VT == MVT::i32)
return std::make_pair(0U, &X86::VK32RegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64RegClass);
}
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i64 || VT == MVT::f64)
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
LLVM_FALLTHROUGH;
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32_ABCDRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_ABCDRegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
case 'l': // INDEX_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
return std::make_pair(0U, &X86::GR64RegClass);
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
if (VT == MVT::i32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
return std::make_pair(0U, &X86::GR64_NOREXRegClass);
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP64RegClass);
return std::make_pair(0U, &X86::RFP80RegClass);
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'Y': // SSE_REGS if SSE2 allowed
if (!Subtarget.hasSSE2()) break;
LLVM_FALLTHROUGH;
case 'v':
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget.hasSSE1()) break;
bool VConstraint = (Constraint[0] == 'v');
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR32XRegClass);
return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
// TODO: Handle i128 in FR128RegClass after it is tested well.
// Vector types and fp128.
case MVT::f128:
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR128XRegClass);
return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR256XRegClass);
if (Subtarget.hasAVX())
return std::make_pair(0U, &X86::VR256RegClass);
break;
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
if (!Subtarget.hasAVX512()) break;
if (VConstraint)
return std::make_pair(0U, &X86::VR512RegClass);
return std::make_pair(0U, &X86::VR512_0_15RegClass);
}
break;
}
} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
switch (Constraint[1]) {
default:
break;
case 'i':
case 't':
case '2':
return getRegForInlineAsmConstraint(TRI, "Y", VT);
case 'm':
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'z':
case '0':
if (!Subtarget.hasSSE1()) break;
return std::make_pair(X86::XMM0, &X86::VR128RegClass);
case 'k':
      // This register class doesn't allocate k0 for masked vector operations.
if (Subtarget.hasAVX512()) {
if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1WMRegClass);
if (VT == MVT::i8)
return std::make_pair(0U, &X86::VK8WMRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::VK16WMRegClass);
}
if (Subtarget.hasBWI()) {
if (VT == MVT::i32)
return std::make_pair(0U, &X86::VK32WMRegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64WMRegClass);
}
break;
}
}
if (parseConstraintCode(Constraint) != X86::COND_INVALID)
return std::make_pair(0U, &X86::GR32RegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
    // Map st(0) .. st(7) onto the corresponding FP0 .. FP7 registers.
if (Constraint.size() == 7 && Constraint[0] == '{' &&
tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
Constraint[3] == '(' &&
(Constraint[4] >= '0' && Constraint[4] <= '7') &&
Constraint[5] == ')' && Constraint[6] == '}') {
// st(7) is not allocatable and thus not a member of RFP80. Return
// singleton class in cases where we have a reference to it.
if (Constraint[4] == '7')
return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
return std::make_pair(X86::FP0 + Constraint[4] - '0',
&X86::RFP80RegClass);
}
// GCC allows "st(0)" to be called just plain "st".
if (StringRef("{st}").equals_lower(Constraint))
return std::make_pair(X86::FP0, &X86::RFP80RegClass);
// flags -> EFLAGS
if (StringRef("{flags}").equals_lower(Constraint))
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
// dirflag -> DF
if (StringRef("{dirflag}").equals_lower(Constraint))
return std::make_pair(X86::DF, &X86::DFCCRRegClass);
// fpsr -> FPSW
if (StringRef("{fpsr}").equals_lower(Constraint))
return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
return Res;
}
// Make sure it isn't a register that requires 64-bit mode.
if (!Subtarget.is64Bit() &&
(isFRClass(*Res.second) || isGRClass(*Res.second)) &&
TRI->getEncodingValue(Res.first) >= 8) {
// Register requires REX prefix, but we're in 32-bit mode.
return std::make_pair(0, nullptr);
}
// Make sure it isn't a register that requires AVX512.
if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
TRI->getEncodingValue(Res.first) & 0x10) {
// Register requires EVEX prefix.
return std::make_pair(0, nullptr);
}
// Otherwise, check to see if this is a register class of the wrong value
// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
// turn into {ax},{dx}.
// MVT::Other is used to specify clobber names.
if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
return Res; // Correct type already, nothing to do.
  // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64-bit integer
  // registers when given an f64 type.
const TargetRegisterClass *Class = Res.second;
// The generic code will match the first register class that contains the
// given register. Thus, based on the ordering of the tablegened file,
// the "plain" GR classes might not come first.
// Therefore, use a helper method.
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
if (Size == 1) Size = 8;
unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
bool is64Bit = Subtarget.is64Bit();
const TargetRegisterClass *RC =
Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
: Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
: nullptr;
if (Size == 64 && !is64Bit) {
// Model GCC's behavior here and select a fixed pair of 32-bit
// registers.
switch (DestReg) {
case X86::RAX:
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
case X86::RDX:
return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
case X86::RCX:
return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
case X86::RBX:
return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
case X86::RSI:
return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
case X86::RDI:
return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
case X86::RBP:
return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
default:
return std::make_pair(0, nullptr);
}
}
if (RC && RC->contains(DestReg))
return std::make_pair(DestReg, RC);
return Res;
}
// No register found/type mismatch.
return std::make_pair(0, nullptr);
} else if (isFRClass(*Class)) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
Res.second = &X86::FR32XRegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
Res.second = &X86::FR64XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
Res.second = &X86::VR128XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
Res.second = &X86::VR256XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
      // Type mismatch and not a clobber: return an error.
Res.first = 0;
Res.second = nullptr;
}
} else if (isVKClass(*Class)) {
if (VT == MVT::i1)
Res.second = &X86::VK1RegClass;
else if (VT == MVT::i8)
Res.second = &X86::VK8RegClass;
else if (VT == MVT::i16)
Res.second = &X86::VK16RegClass;
else if (VT == MVT::i32)
Res.second = &X86::VK32RegClass;
else if (VT == MVT::i64)
Res.second = &X86::VK64RegClass;
else {
      // Type mismatch and not a clobber: return an error.
Res.first = 0;
Res.second = nullptr;
}
}
return Res;
}
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// An indexed folded instruction, i.e., inst (reg1, reg2, scale),
// will take 2 allocations in the out of order engine instead of 1
// for plain addressing mode, i.e. inst (reg1).
// E.g.,
// vaddps (%rsi,%rdx), %ymm0, %ymm1
// Requires two allocations (one for the load, one for the computation)
// whereas:
// vaddps (%rsi), %ymm0, %ymm1
// Requires just 1 allocation, i.e., freeing allocations for other operations
// and having less micro operations to execute.
//
// For some X86 architectures, this is even worse because for instance for
// stores, the complex addressing mode forces the instruction to use the
// "load" ports instead of the dedicated "store" port.
// E.g., on Haswell:
// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
// vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1
// as soon as we use a second register.
return AM.Scale != 0;
return -1;
}
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on x86 is expensive. However, when aggressively optimizing
// for code size, we prefer to use a div instruction, as it is usually smaller
// than the alternative sequence.
// The exception to this is vector division. Since x86 doesn't have vector
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize =
Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
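// For illustration (hypothetical C source): under __attribute__((minsize))
// a scalar division keeps the compact idiv-based sequence, while a vector
// division is still expanded because x86 has no vector integer divide:
//
//   __attribute__((minsize)) int DivScalar(int X) { return X / 10; }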
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
if (!Subtarget.is64Bit())
return;
// Update IsSplitCSR in X86MachineFunctionInfo.
X86MachineFunctionInfo *AFI =
Entry->getParent()->getInfo<X86MachineFunctionInfo>();
AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (X86::GR64RegClass.contains(*I))
RC = &X86::GR64RegClass;
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
assert(
Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef
X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
// Generally, if we aren't on Windows, the platform ABI does not include
// support for stack probes, so don't emit them.
if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
return "";
// We need a stack probe to conform to the Windows ABI. Choose the right
// symbol.
if (Subtarget.is64Bit())
return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
unsigned
X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
// The default stack probe size is 4096 if the function has no stackprobesize
// attribute.
unsigned StackProbeSize = 4096;
const Function &Fn = MF.getFunction();
if (Fn.hasFnAttribute("stack-probe-size"))
Fn.getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
return StackProbeSize;
}
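// For illustration, front ends control both hooks via string function
// attributes; a sketch using the IR-level API (hypothetical values):
//
//   F.addFnAttr("probe-stack", "__chkstk");    // force a probe symbol
//   F.addFnAttr("stack-probe-size", "8192");   // probe every 8 KiB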
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h (revision 362609)
@@ -1,1719 +1,1719 @@
//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
class X86Subtarget;
class X86TargetMachine;
namespace X86ISD {
// X86 Specific DAG Nodes
enum NodeType : unsigned {
// Start the numbering where the builtin ops leave off.
FIRST_NUMBER = ISD::BUILTIN_OP_END,
/// Bit scan forward.
BSF,
/// Bit scan reverse.
BSR,
/// Double shift instructions. These correspond to
/// X86::SHLDxx and X86::SHRDxx instructions.
SHLD,
SHRD,
/// Bitwise logical AND of floating point values. This corresponds
/// to X86::ANDPS or X86::ANDPD.
FAND,
/// Bitwise logical OR of floating point values. This corresponds
/// to X86::ORPS or X86::ORPD.
FOR,
/// Bitwise logical XOR of floating point values. This corresponds
/// to X86::XORPS or X86::XORPD.
FXOR,
/// Bitwise logical ANDNOT of floating point values. This
/// corresponds to X86::ANDNPS or X86::ANDNPD.
FANDN,
/// These operations represent an abstract X86 call
/// instruction, which includes a bunch of information. In particular, the
/// operands of this node are:
///
/// #0 - The incoming token chain
/// #1 - The callee
/// #2 - The number of arg bytes the caller pushes on the stack.
/// #3 - The number of arg bytes the callee pops off the stack.
/// #4 - The value to pass in AL/AX/EAX (optional)
/// #5 - The value to pass in DL/DX/EDX (optional)
///
/// The result values of these nodes are:
///
/// #0 - The outgoing token chain
/// #1 - The first register result value (optional)
/// #2 - The second register result value (optional)
///
CALL,
/// Same as call except it adds the NoTrack prefix.
NT_CALL,
/// X86 compare and logical compare instructions.
CMP, COMI, UCOMI,
/// X86 bit-test instructions.
BT,
/// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
/// operand, usually produced by a CMP instruction.
SETCC,
/// X86 Select
SELECTS,
// Same as SETCC except it's materialized with an sbb and the value is all
// ones or all zeros.
SETCC_CARRY, // R = carry_bit ? ~0 : 0
/// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
/// Operands are two FP values to compare; result is a mask of
/// 0s or 1s. Generally DTRT for C/C++ with NaNs.
FSETCC,
/// X86 FP SETCC, similar to above, but with output as an i1 mask and
/// a version with SAE.
FSETCCM, FSETCCM_SAE,
/// X86 conditional moves. Operand 0 and operand 1 are the two values
/// to select from. Operand 2 is the condition code, and operand 3 is the
/// flag operand produced by a CMP or TEST instruction.
CMOV,
/// X86 conditional branches. Operand 0 is the chain operand, operand 1
/// is the block to branch if condition is true, operand 2 is the
/// condition code, and operand 3 is the flag operand produced by a CMP
/// or TEST instruction.
BRCOND,
/// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
/// operand 1 is the target address.
NT_BRIND,
/// Return with a flag operand. Operand 0 is the chain operand, operand
/// 1 is the number of bytes of stack to pop.
RET_FLAG,
/// Return from interrupt. Operand 0 is the number of bytes to pop.
IRET,
/// Repeat fill, corresponds to X86::REP_STOSx.
REP_STOS,
/// Repeat move, corresponds to X86::REP_MOVSx.
REP_MOVS,
/// On Darwin, this node represents the result of the popl
/// at function entry, used for PIC code.
GlobalBaseReg,
/// A wrapper node for TargetConstantPool, TargetJumpTable,
/// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
/// MCSymbol and TargetBlockAddress.
Wrapper,
/// Special wrapper used under X86-64 PIC mode for RIP
/// relative displacements.
WrapperRIP,
/// Copies a 64-bit value from an MMX vector to the low word
/// of an XMM vector, with the high word zero filled.
MOVQ2DQ,
/// Copies a 64-bit value from the low word of an XMM vector
/// to an MMX vector.
MOVDQ2Q,
/// Copies a 32-bit value from the low word of an MMX
/// vector to a GPR.
MMX_MOVD2W,
/// Copies a GPR into the low 32-bit word of an MMX vector
/// and zeroes out the high word.
MMX_MOVW2D,
/// Extract an 8-bit value from a vector and zero extend it to
/// i32, corresponds to X86::PEXTRB.
PEXTRB,
/// Extract a 16-bit value from a vector and zero extend it to
/// i32, corresponds to X86::PEXTRW.
PEXTRW,
/// Insert any element of a 4 x float vector into any element
/// of a destination 4 x float vector.
INSERTPS,
/// Insert the lower 8-bits of a 32-bit value to a vector,
/// corresponds to X86::PINSRB.
PINSRB,
/// Insert the lower 16-bits of a 32-bit value to a vector,
/// corresponds to X86::PINSRW.
PINSRW,
/// Shuffle 16 8-bit values within a vector.
PSHUFB,
/// Compute Sum of Absolute Differences.
PSADBW,
/// Compute Double Block Packed Sum-Absolute-Differences
DBPSADBW,
/// Bitwise Logical AND NOT of Packed FP values.
ANDNP,
/// Blend where the selector is an immediate.
BLENDI,
/// Dynamic (non-constant condition) vector blend where only the sign bits
/// of the condition elements are used. This is used to enforce that the
/// condition mask is not valid for generic VSELECT optimizations. This
/// is also used to implement the intrinsics.
/// Operands are in VSELECT order: MASK, TRUE, FALSE
BLENDV,
/// Combined add and sub on an FP vector.
ADDSUB,
// FP vector ops with rounding mode.
FADD_RND, FADDS, FADDS_RND,
FSUB_RND, FSUBS, FSUBS_RND,
FMUL_RND, FMULS, FMULS_RND,
FDIV_RND, FDIVS, FDIVS_RND,
FMAX_SAE, FMAXS_SAE,
FMIN_SAE, FMINS_SAE,
FSQRT_RND, FSQRTS, FSQRTS_RND,
// FP vector get exponent.
FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE,
// Extract Normalized Mantissas.
VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE,
// FP Scale.
SCALEF, SCALEF_RND,
SCALEFS, SCALEFS_RND,
// Unsigned Integer average.
AVG,
/// Integer horizontal add/sub.
HADD,
HSUB,
/// Floating point horizontal add/sub.
FHADD,
FHSUB,
// Detect Conflicts Within a Vector
CONFLICT,
/// Floating point max and min.
FMAX, FMIN,
/// Commutative FMIN and FMAX.
FMAXC, FMINC,
/// Scalar intrinsic floating point max and min.
FMAXS, FMINS,
/// Floating point reciprocal-sqrt and reciprocal approximation.
/// Note that these typically require refinement
/// in order to obtain suitable precision.
FRSQRT, FRCP,
// AVX-512 reciprocal approximations with a little more precision.
RSQRT14, RSQRT14S, RCP14, RCP14S,
// Thread Local Storage.
TLSADDR,
// Thread Local Storage. A call to get the start address
// of the TLS block for the current module.
TLSBASEADDR,
// Thread Local Storage. A call to an OS provided
// thunk at the address from an earlier relocation.
TLSCALL,
// Exception Handling helpers.
EH_RETURN,
// SjLj exception handling setjmp.
EH_SJLJ_SETJMP,
// SjLj exception handling longjmp.
EH_SJLJ_LONGJMP,
// SjLj exception handling dispatch.
EH_SJLJ_SETUP_DISPATCH,
/// Tail call return. See X86TargetLowering::LowerCall for
/// the list of operands.
TC_RETURN,
// Vector move to low scalar and zero higher vector elements.
VZEXT_MOVL,
// Vector integer truncate.
VTRUNC,
// Vector integer truncate with unsigned/signed saturation.
VTRUNCUS, VTRUNCS,
// Masked version of the above. Used when less than a 128-bit result is
// produced since the mask only applies to the lower elements and can't
// be represented by a select.
// SRC, PASSTHRU, MASK
VMTRUNC, VMTRUNCUS, VMTRUNCS,
// Vector FP extend.
VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE,
// Vector FP round.
VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND,
// Masked version of above. Used for v2f64->v4f32.
// SRC, PASSTHRU, MASK
VMFPROUND,
// 128-bit vector logical left / right shift
VSHLDQ, VSRLDQ,
// Vector shift elements
VSHL, VSRL, VSRA,
// Vector variable shift
VSHLV, VSRLV, VSRAV,
// Vector shift elements by immediate
VSHLI, VSRLI, VSRAI,
// Shifts of mask registers.
KSHIFTL, KSHIFTR,
// Bit rotate by immediate
VROTLI, VROTRI,
// Vector packed double/float comparison.
CMPP,
// Vector integer comparisons.
PCMPEQ, PCMPGT,
// v8i16 Horizontal minimum and position.
PHMINPOS,
MULTISHIFT,
/// Vector comparison generating mask bits for fp and
/// integer signed and unsigned data types.
CMPM,
// Vector comparison with SAE for FP values
CMPM_SAE,
// Arithmetic operations with FLAGS results.
ADD, SUB, ADC, SBB, SMUL, UMUL,
OR, XOR, AND,
// Bit field extract.
BEXTR,
// Zero High Bits Starting with Specified Bit Position.
BZHI,
// X86-specific multiply by immediate.
MUL_IMM,
// Vector sign bit extraction.
MOVMSK,
// Vector bitwise comparisons.
PTEST,
// Vector packed fp sign bitwise comparisons.
TESTP,
// OR/AND test for masks.
KORTEST,
KTEST,
// ADD for masks.
KADD,
// Several flavors of instructions with vector shuffle behaviors.
// Saturated signed/unsigned packing.
PACKSS,
PACKUS,
// Intra-lane alignr.
PALIGNR,
// AVX512 inter-lane alignr.
VALIGN,
PSHUFD,
PSHUFHW,
PSHUFLW,
SHUFP,
// VBMI2 Concat & Shift.
VSHLD,
VSHRD,
VSHLDV,
VSHRDV,
// Shuffle Packed Values at 128-bit granularity.
SHUF128,
MOVDDUP,
MOVSHDUP,
MOVSLDUP,
MOVLHPS,
MOVHLPS,
MOVSD,
MOVSS,
UNPCKL,
UNPCKH,
VPERMILPV,
VPERMILPI,
VPERMI,
VPERM2X128,
// Variable Permute (VPERM).
// Res = VPERMV MaskV, V0
VPERMV,
// 3-op Variable Permute (VPERMT2).
// Res = VPERMV3 V0, MaskV, V1
VPERMV3,
// Bitwise ternary logic.
VPTERNLOG,
// Fix Up Special Packed Float32/64 values.
VFIXUPIMM, VFIXUPIMM_SAE,
VFIXUPIMMS, VFIXUPIMMS_SAE,
// Range Restriction Calculation For Packed Pairs of Float32/64 values.
VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE,
// Reduce - Perform Reduction Transformation on scalar/packed FP.
VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE,
// RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
// Also used by the legacy (V)ROUND intrinsics where we mask out the
// scaling part of the immediate.
VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE,
// Tests types of FP values for packed types.
VFPCLASS,
// Tests types of FP values for scalar types.
VFPCLASSS,
// Broadcast (splat) scalar or element 0 of a vector. If the operand is
// a vector, this node may change the vector length as part of the splat.
VBROADCAST,
// Broadcast mask to vector.
VBROADCASTM,
// Broadcast subvector to vector.
SUBV_BROADCAST,
/// SSE4A Extraction and Insertion.
EXTRQI, INSERTQI,
// XOP arithmetic/logical shifts.
VPSHA, VPSHL,
// XOP signed/unsigned integer comparisons.
VPCOM, VPCOMU,
// XOP packed permute bytes.
VPPERM,
// XOP two source permutation.
VPERMIL2,
// Vector multiply packed unsigned doubleword integers.
PMULUDQ,
// Vector multiply packed signed doubleword integers.
PMULDQ,
// Vector Multiply Packed Unsigned Integers with Round and Scale.
MULHRS,
// Multiply and Add Packed Integers.
VPMADDUBSW, VPMADDWD,
// AVX512IFMA multiply and add.
// NOTE: These are different from the instructions and perform
// op0 x op1 + op2.
VPMADD52L, VPMADD52H,
// VNNI
VPDPBUSD,
VPDPBUSDS,
VPDPWSSD,
VPDPWSSDS,
// FMA nodes.
// We use the target independent ISD::FMA for the non-inverted case.
FNMADD,
FMSUB,
FNMSUB,
FMADDSUB,
FMSUBADD,
// FMA with rounding mode.
FMADD_RND,
FNMADD_RND,
FMSUB_RND,
FNMSUB_RND,
FMADDSUB_RND,
FMSUBADD_RND,
// Compress and expand.
COMPRESS,
EXPAND,
// Bits shuffle
VPSHUFBITQMB,
// Convert Unsigned/Integer to Floating-Point Value with rounding mode.
SINT_TO_FP_RND, UINT_TO_FP_RND,
SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP,
SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
// Vector float/double to signed/unsigned integer.
CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
// Scalar float/double to signed/unsigned integer.
CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND,
// Vector float/double to signed/unsigned integer with truncation.
CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE,
// Scalar float/double to signed/unsigned integer with truncation.
CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE,
// Vector signed/unsigned integer to float/double.
CVTSI2P, CVTUI2P,
// Masked versions of above. Used for v2f64->v4f32.
// SRC, PASSTHRU, MASK
MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI,
MCVTSI2P, MCVTUI2P,
// Vector float to bfloat16.
// Convert TWO packed single data to one packed BF16 data
CVTNE2PS2BF16,
// Convert packed single data to packed BF16 data
CVTNEPS2BF16,
// Masked version of above.
// SRC, PASSTHRU, MASK
MCVTNEPS2BF16,
// Dot product of BF16 pairs accumulated into
// packed single precision.
DPBF16PS,
// Save xmm argument registers to the stack, according to %al. An operator
// is needed so that this can be expanded with control flow.
VASTART_SAVE_XMM_REGS,
// Windows's _chkstk call to do stack probing.
WIN_ALLOCA,
// For allocating variable amounts of stack space when using
// segmented stacks. Checks if the current stacklet has enough space, and
// falls back to heap allocation if not.
SEG_ALLOCA,
// Memory barriers.
MEMBARRIER,
MFENCE,
// Store FP status word into i16 register.
FNSTSW16r,
// Store contents of %ah into %eflags.
SAHF,
// Get a random integer and indicate whether it is valid in CF.
RDRAND,
// Get a NIST SP800-90B & C compliant random integer and
// indicate whether it is valid in CF.
RDSEED,
// Protection keys
// RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
// WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
// value for ECX.
RDPKRU, WRPKRU,
// SSE42 string comparisons.
// These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
// will emit one or two instructions based on which results are used. If
// flags and index/mask are both used, this allows us to use a single
// instruction since we won't have to pick an opcode for flags. Instead we
// can rely on the DAG to CSE everything and decide at isel.
PCMPISTR,
PCMPESTR,
// Test if in transactional execution.
XTEST,
// ERI instructions.
RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE,
RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE,
// Conversions between float and half-float.
CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE,
// Masked version of above.
// SRC, RND, PASSTHRU, MASK
MCVTPS2PH,
// Galois Field Arithmetic Instructions
GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,
// LWP insert record.
LWPINS,
// User level wait
UMWAIT, TPAUSE,
// Enqueue Stores Instructions
ENQCMD, ENQCMDS,
// For avx512-vp2intersect
VP2INTERSECT,
/// X86 strict FP compare instructions.
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
STRICT_FCMPS,
// Vector packed double/float comparison.
STRICT_CMPP,
/// Vector comparison generating mask bits for fp and
/// integer signed and unsigned data types.
STRICT_CMPM,
// Vector float/double to signed/unsigned integer with truncation.
STRICT_CVTTP2SI, STRICT_CVTTP2UI,
// Vector FP extend.
STRICT_VFPEXT,
// Vector FP round.
STRICT_VFPROUND,
// RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
// Also used by the legacy (V)ROUND intrinsics where we mask out the
// scaling part of the immediate.
STRICT_VRNDSCALE,
// Vector signed/unsigned integer to float/double.
STRICT_CVTSI2P, STRICT_CVTUI2P,
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
LCMPXCHG16_DAG,
LCMPXCHG8_SAVE_EBX_DAG,
LCMPXCHG16_SAVE_RBX_DAG,
/// LOCK-prefixed arithmetic read-modify-write instructions.
/// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
LADD, LSUB, LOR, LXOR, LAND,
// Load, scalar_to_vector, and zero extend.
VZEXT_LOAD,
// extract_vector_elt, store.
VEXTRACT_STORE,
// scalar broadcast from memory
VBROADCAST_LOAD,
// Store FP control word into i16 memory.
FNSTCW16m,
/// This instruction implements FP_TO_SINT with the
/// integer destination in memory and a FP reg source. This corresponds
/// to the X86::FIST*m instructions and the rounding mode change stuff. It
/// has two inputs (token chain and address) and two outputs (int value
/// and token chain). Memory VT specifies the type to store to.
FP_TO_INT_IN_MEM,
/// This instruction implements SINT_TO_FP with the
/// integer source in memory and FP reg result. This corresponds to the
/// X86::FILD*m instructions. It has two inputs (token chain and address)
/// and two outputs (FP value and token chain). FILD_FLAG also produces a
/// flag. The integer source type is specified by the memory VT.
FILD,
FILD_FLAG,
/// This instruction implements a fp->int store from FP stack
/// slots. This corresponds to the fist instruction. It takes a
/// chain operand, value to store, address, and glue. The memory VT
/// specifies the type to store as.
FIST,
/// This instruction implements an extending load to FP stack slots.
/// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
/// operand, and ptr to load from. The memory VT specifies the type to
/// load from.
FLD,
/// This instruction implements a truncating store from FP stack
/// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
/// chain operand, value to store, address, and glue. The memory VT
/// specifies the type to store as.
FST,
/// This instruction grabs the address of the next argument
/// from a va_list. (reads and modifies the va_list in memory)
VAARG_64,
// Vector truncating store with unsigned/signed saturation
VTRUNCSTOREUS, VTRUNCSTORES,
// Vector truncating masked store with unsigned/signed saturation
VMTRUNCSTOREUS, VMTRUNCSTORES,
// X86 specific gather and scatter
MGATHER, MSCATTER,
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
// opcodes will be treated as target memory ops!
};
} // end namespace X86ISD
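// For illustration (a sketch of how these opcodes are used during custom
// lowering): target nodes are built like any generic SDNode, e.g. a
// flags-producing compare:
//
//   SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, LHS, RHS);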
/// Define some predicates that are used for node matching.
namespace X86 {
/// Returns true if Elt is a constant zero or floating point constant +0.0.
bool isZeroNode(SDValue Elt);
/// Returns true if the given offset can
/// fit into the displacement field of the instruction.
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
bool hasSymbolicDisplacement = true);
/// Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
bool isCalleePop(CallingConv::ID CallingConv,
bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
/// If Op is a constant whose elements are all the same constant or
/// undefined, return true and return the constant value in \p SplatVal.
bool isConstantSplat(SDValue Op, APInt &SplatVal);
} // end namespace X86
//===--------------------------------------------------------------------===//
// X86 Implementation of the TargetLowering interface
class X86TargetLowering final : public TargetLowering {
public:
explicit X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI);
unsigned getJumpTableEncoding() const override;
bool useSoftFloat() const override;
void markLibCallAttributes(MachineFunction *MF, unsigned CC,
ArgListTy &Args) const override;
MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
return MVT::i8;
}
const MCExpr *
LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB, unsigned uid,
MCContext &Ctx) const override;
/// Returns relocation base for the given PIC jumptable.
SDValue getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const override;
const MCExpr *
getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
unsigned JTI, MCContext &Ctx) const override;
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest are at
/// 4-byte boundaries.
unsigned getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const override;
/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, that means it is safe: the destination
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
/// means there isn't a need to check it against alignment requirement,
/// probably because the source does not need to be loaded. If 'IsMemset' is
/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const override;
/// Returns true if it's safe to use load / store of the
/// specified type to expand memcpy / memset inline. This is mostly true
/// for all types except for some special cases. For example, on X86
/// targets without SSE2 f64 load / store are done with fldl / fstpl which
/// also does type conversion. Note the specified type doesn't have to be
/// legal as the hook is used before type legalization.
bool isSafeMemOpType(MVT VT) const override;
/// Returns true if the target allows unaligned memory accesses of the
/// specified type. Returns whether it is "fast" in the last argument.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
MachineMemOperand::Flags Flags,
bool *Fast) const override;
/// Provide custom lowering hooks for some operations.
///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
/// Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
// Return true if it is profitable to combine a BUILD_VECTOR with a
// stride-pattern to a shuffle and a truncate.
// Example of such a combine:
// v4i32 build_vector((extract_elt V, 1),
// (extract_elt V, 3),
// (extract_elt V, 5),
// (extract_elt V, 7))
// -->
// v4i32 truncate (bitcast (shuffle<1,u,3,u,4,u,5,u,6,u,7,u> V, u) to
// v4i64)
bool isDesirableToCombineBuildVectorToShuffleTruncate(
ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override;
/// Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
/// Return true if the target has native support for the
/// specified value type and it is 'desirable' to use the type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
/// Return 1 if we can compute the negated form of the specified expression
/// for the same cost as the expression itself, or 2 if we can compute the
/// negated form more cheaply than the expression itself. Else return 0.
char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
bool ForCodeSize, unsigned Depth) const override;
/// If isNegatibleForFree returns true, return the newly negated expression.
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations, bool ForCodeSize,
unsigned Depth) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
/// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
/// Do not merge vector stores after legalization because that may conflict
/// with x86-specific store splitting optimizations.
bool mergeStoresAfterLegalization(EVT MemVT) const override {
return !MemVT.isVector();
}
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const SelectionDAG &DAG) const override;
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
bool isCtlzFast() const override;
bool hasBitPreservingFPLogic(EVT VT) const override {
return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
}
bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
// If the pair to store is a mixture of float and int values, we will
// save two bitwise instructions and one float-to-int instruction and
// add one store instruction. There is potentially a more significant
// benefit because it avoids the float->int domain switch for the input
// value, so it is more likely a win.
if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
(LTy.isInteger() && HTy.isFloatingPoint()))
return true;
// If the pair only contains int values, we will save two bitwise
// instructions and add one store instruction (costing one more
// store buffer entry). Since the benefit is less clear, we leave
// such pairs out until we get a testcase proving it is a win.
return false;
}
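// For illustration (hypothetical queries):
//   isMultiStoresCheaperThanBitsMerge(MVT::f32, MVT::i32) -> true
//   isMultiStoresCheaperThanBitsMerge(MVT::i32, MVT::i32) -> false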
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
bool hasAndNotCompare(SDValue Y) const override;
bool hasAndNot(SDValue Y) const override;
bool hasBitTest(SDValue X, SDValue Y) const override;
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
SelectionDAG &DAG) const override;
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
CombineLevel Level) const override;
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
bool
shouldTransformSignedTruncationCheck(EVT XVT,
unsigned KeptBits) const override {
// For vectors, we don't have a preference.
if (XVT.isVector())
return false;
auto VTIsOk = [](EVT VT) -> bool {
return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
VT == MVT::i64;
};
// We are ok with KeptBitsVT being byte/word/dword, which is what MOVSX supports.
// XVT will be larger than KeptBitsVT.
MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
}
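// For illustration (hypothetical queries):
//   shouldTransformSignedTruncationCheck(MVT::i32, 8)  -> true  (byte MOVSX)
//   shouldTransformSignedTruncationCheck(MVT::i32, 13) -> false (no i13 MOVSX)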
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
bool shouldSplatInsEltVarIndex(EVT VT) const override;
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
return VT.isScalarInteger();
}
/// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
MVT hasFastEqualityCompare(unsigned NumBits) const override;
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
TargetLoweringOpt &TLO) const override;
/// Determine which of the bits specified in Mask are known to be either
/// zero or one and return them in the KnownZero/KnownOne bitsets.
void computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
/// Determine the number of bits in the operation that are sign bits.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const override;
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
const APInt &DemandedElts,
APInt &KnownUndef,
APInt &KnownZero,
TargetLoweringOpt &TLO,
unsigned Depth) const override;
bool SimplifyDemandedBitsForTargetNode(SDValue Op,
const APInt &DemandedBits,
const APInt &DemandedElts,
KnownBits &Known,
TargetLoweringOpt &TLO,
unsigned Depth) const override;
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
SelectionDAG &DAG, unsigned Depth) const override;
const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
SDValue unwrapAddress(SDValue N) const override;
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
bool ExpandInlineAsm(CallInst *CI) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
ConstraintWeight
getSingleConstraintMatchWeight(AsmOperandInfo &info,
const char *constraint) const override;
const char *LowerXConstraint(EVT ConstraintVT) const override;
/// Lower the specified operand into the Ops vector. If it is invalid, don't
/// add anything to Ops. If hasMemory is true it means one of the asm
/// constraints of the inline asm instruction being processed is 'm'.
void LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
unsigned
getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "o")
return InlineAsm::Constraint_o;
else if (ConstraintCode == "v")
return InlineAsm::Constraint_v;
else if (ConstraintCode == "X")
return InlineAsm::Constraint_X;
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
/// Handle Lowering flag assembly outputs.
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, SDLoc DL,
const AsmOperandInfo &Constraint,
SelectionDAG &DAG) const override;
/// Given a physical register constraint
/// (e.g. {edx}), return the register number and the register class for the
/// register. This should only be used for C_Register constraints. On
/// error, this returns a register number of 0.
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
/// Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
Type *Ty, unsigned AS,
Instruction *I = nullptr) const override;
/// Return true if the specified immediate is a legal
/// icmp immediate, that is, the target has icmp instructions which can
/// compare a register against the immediate without having to materialize
/// the immediate into a register.
bool isLegalICmpImmediate(int64_t Imm) const override;
/// Return true if the specified immediate is a legal
/// add immediate, that is, the target has add instructions which can
/// add a register and the immediate without having to materialize
/// the immediate into a register.
bool isLegalAddImmediate(int64_t Imm) const override;
bool isLegalStoreImmediate(int64_t Imm) const override;
/// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, it returns a negative value.
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
bool isVectorShiftByScalarCheap(Type *Ty) const override;
/// Add x86-specific opcodes to the default list.
bool isBinOp(unsigned Opcode) const override;
/// Returns true if the opcode is a commutative binary operation.
bool isCommutativeBinOp(unsigned Opcode) const override;
/// Return true if it's free to truncate a value of
/// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
/// register EAX to i16 by referencing its sub-register AX.
bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
bool isTruncateFree(EVT VT1, EVT VT2) const override;
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
/// Return true if any actual instruction that defines a
/// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
/// register. This does not necessarily include registers defined in
/// unknown ways, such as incoming arguments, or copies from unknown
/// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
/// does not necessarily apply to truncate instructions. e.g. on x86-64,
/// all instructions that define 32-bit values implicitly zero-extend the
/// result out to 64 bits.
bool isZExtFree(Type *Ty1, Type *Ty2) const override;
bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
/// Return true if folding a vector load into ExtVal (a sign, zero, or any
/// extend node) is profitable.
bool isVectorLoadExtDesirable(SDValue) const override;
/// Return true if an FMA operation is faster than a pair of fmul and fadd
/// instructions. fmuladd intrinsics will be expanded to FMAs when this
/// method returns true, otherwise fmuladd is expanded to fmul + fadd.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
/// Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
/// from i32 to i8 but not from i32 to i16.
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
/// Given an intrinsic, checks whether on the target the intrinsic will need
/// to map to a MemIntrinsicNode (touches memory). If this is the case, it returns
/// true and stores the intrinsic information into the IntrinsicInfo that was
/// passed to the function.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const override;
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
/// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
/// be legal.
bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
/// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
/// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
/// constant pool entry.
bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
/// Returns true if lowering to a jump table is allowed.
bool areJTsAllowed(const Function *Fn) const override;
/// If true, then instruction selection should
/// seek to shrink the FP constant of the specified type to a smaller type
/// in order to save space and / or reduce runtime.
bool ShouldShrinkFPConstant(EVT VT) const override {
// Don't shrink FP constant pool entries if SSE2 is available since cvtss2sd is more
// expensive than a straight movsd. On the other hand, it's important to
// shrink long double fp constant since fldt is very slow.
return !X86ScalarSSEf64 || VT == MVT::f80;
}
/// Return true if we believe it is correct and profitable to reduce the
/// load node to a smaller type.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
EVT NewVT) const override;
/// Return true if the specified scalar FP type is computed in an SSE
/// register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
(VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
}
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
bool convertSelectOfConstantsToMath(EVT VT) const override;
bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const override;
/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
/// with this index.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
/// Scalar ops always have equal or better analysis/performance/power than
/// the vector equivalent, so this always makes sense if the scalar op is
/// supported.
bool shouldScalarizeBinop(SDValue) const override;
/// Extract of a scalar FP value from index 0 of a vector is free.
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
EVT EltVT = VT.getScalarType();
return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
}
/// Overflow nodes should get combined/lowered to optimal instructions
/// (they should allow eliminating explicit compares by getting flags from
/// math ops).
bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override;
bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
unsigned AddrSpace) const override {
// If we can replace more than 2 scalar stores, there will be a reduction
// in instructions even after we add a vector constant load.
return NumElem > 2;
}
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
const SelectionDAG &DAG,
const MachineMemOperand &MMO) const override;
/// Intel processors have a unified instruction and data cache
const char * getClearCacheBuiltinName() const override {
return nullptr; // nothing to do, move along.
}
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
unsigned
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
unsigned
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
virtual bool needsFixedCatchObjects() const override;
/// This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const override;
/// If the target has a standard location for the stack protector cookie,
/// returns the address of that location. Otherwise, returns nullptr.
Value *getIRStackGuard(IRBuilder<> &IRB) const override;
bool useLoadStackGuardNode() const override;
bool useStackGuardXorFP() const override;
void insertSSPDeclarations(Module &M) const override;
Value *getSDagStackGuard(const Module &M) const override;
Function *getSSPStackGuardCheck(const Module &M) const override;
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL) const override;
/// Return true if the target stores SafeStack pointer at a fixed offset in
/// some non-standard address space, and populates the address space and
/// offset as appropriate.
Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
std::pair<SDValue, SDValue> BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
/// Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
EVT VT) const override;
unsigned getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const override;
unsigned getVectorTypeBreakdownForCallingConv(
LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;
StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
unsigned getStackProbeSize(MachineFunction &MF) const;
bool hasVectorBlend() const override { return true; }
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
/// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedLoad(LoadInst *LI,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
SDValue Addr, SelectionDAG &DAG)
const override;
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const override;
private:
/// Keep a reference to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget &Subtarget;
/// Select between SSE or x87 floating point ops.
/// When SSE is available, use it for f32 operations.
/// When SSE2 is available, use it for f64 operations.
bool X86ScalarSSEf32;
bool X86ScalarSSEf64;
/// A list of legal FP immediates.
std::vector<APFloat> LegalFPImmediates;
/// Indicate that this x86 target can instruction
/// select the specified FP immediate natively.
void addLegalFPImmediate(const APFloat& Imm) {
LegalFPImmediates.push_back(Imm);
}
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
uint32_t *RegMask) const;
SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &ArgInfo,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA, MachineFrameInfo &MFI,
unsigned i) const;
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const;
// Call lowering helpers.
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool IsEligibleForTailCallOptimization(SDValue Callee,
CallingConv::ID CalleeCC,
bool isVarArg,
bool isCalleeStructRet,
bool isCallerStructRet,
Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
SDValue Chain, bool IsTailCall,
bool Is64Bit, int FPDiff,
const SDLoc &dl) const;
unsigned GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG &DAG) const;
unsigned getAddressSpace() const;
SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned,
SDValue &Chain) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
const unsigned char OpFlags = 0) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
/// Creates target global address or external symbol nodes for calls or
/// other uses.
SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
bool ForCall) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTRICT_FSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const;
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const override;
bool supportSplitCSR(MachineFunction *MF) const override {
return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
}
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
ISD::NodeType ExtendKind) const override;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override;
bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override;
bool needsCmpXchgNb(Type *MemType) const;
void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB, int FI) const;
// Utility function to emit the low-level va_arg code for X86-64.
MachineBasicBlock *
EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const;
/// Utility function to emit the xmm reg save portion of va_start.
MachineBasicBlock *
EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
MachineInstr &MI2,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
void emitSetJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const;
MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const;
MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
MachineBasicBlock *MBB) const;
MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *MBB) const;
/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
SDValue &X86CC, SDValue &Chain,
bool IsSignaling) const;
/// Check if replacement of SQRT with RSQRT should be disabled.
bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
/// Use rsqrt* to speed up sqrt calculations.
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps, bool &UseOneConstNR,
bool Reciprocal) const override;
/// Use rcp* to speed up fdiv calculations.
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps) const override;
/// Reassociate floating point divisions into multiply by reciprocal.
unsigned combineRepeatedFPDivisors() const override;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
};
namespace X86 {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo);
} // end namespace X86
// Base class for all X86 non-masked store operations.
class X86StoreSDNode : public MemSDNode {
public:
X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
SDVTList VTs, EVT MemVT,
MachineMemOperand *MMO)
: MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
const SDValue &getValue() const { return getOperand(1); }
const SDValue &getBasePtr() const { return getOperand(2); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VTRUNCSTORES ||
N->getOpcode() == X86ISD::VTRUNCSTOREUS;
}
};
// Base class for all X86 masked store operations.
// The class has the same order of operands as MaskedStoreSDNode for
// convenience.
class X86MaskedStoreSDNode : public MemSDNode {
public:
X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
const DebugLoc &dl, SDVTList VTs, EVT MemVT,
MachineMemOperand *MMO)
: MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
const SDValue &getValue() const { return getOperand(1); }
const SDValue &getBasePtr() const { return getOperand(2); }
const SDValue &getMask() const { return getOperand(3); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
}
};
// X86 Truncating Store with Signed saturation.
class TruncSStoreSDNode : public X86StoreSDNode {
public:
TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
: X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VTRUNCSTORES;
}
};
// X86 Truncating Store with Unsigned saturation.
class TruncUSStoreSDNode : public X86StoreSDNode {
public:
TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
: X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
}
};
// X86 Truncating Masked Store with Signed saturation.
class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
public:
MaskedTruncSStoreSDNode(unsigned Order,
const DebugLoc &dl, SDVTList VTs, EVT MemVT,
MachineMemOperand *MMO)
: X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VMTRUNCSTORES;
}
};
// X86 Truncating Masked Store with Unsigned saturation.
class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
public:
MaskedTruncUSStoreSDNode(unsigned Order,
const DebugLoc &dl, SDVTList VTs, EVT MemVT,
MachineMemOperand *MMO)
: X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
}
};
// X86 specific Gather/Scatter nodes.
// The class has the same order of operands as MaskedGatherScatterSDNode for
// convenience.
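// Operand layout, per the accessors below: 0 = chain, 1 = pass-through/value,
// 2 = mask, 3 = base pointer, 4 = index, 5 = scale.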
class X86MaskedGatherScatterSDNode : public MemSDNode {
public:
X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order,
const DebugLoc &dl, SDVTList VTs, EVT MemVT,
MachineMemOperand *MMO)
: MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {}
const SDValue &getBasePtr() const { return getOperand(3); }
const SDValue &getIndex() const { return getOperand(4); }
const SDValue &getMask() const { return getOperand(2); }
const SDValue &getScale() const { return getOperand(5); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::MGATHER ||
N->getOpcode() == X86ISD::MSCATTER;
}
};
class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
public:
X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
EVT MemVT, MachineMemOperand *MMO)
: X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT,
MMO) {}
const SDValue &getPassThru() const { return getOperand(1); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::MGATHER;
}
};
class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
public:
X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
EVT MemVT, MachineMemOperand *MMO)
: X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT,
MMO) {}
const SDValue &getValue() const { return getOperand(1); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::MSCATTER;
}
};
/// Generate unpacklo/unpackhi shuffle mask.
template <typename T = int>
void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo,
bool Unary) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
int NumElts = VT.getVectorNumElements();
int NumEltsInLane = 128 / VT.getScalarSizeInBits();
for (int i = 0; i < NumElts; ++i) {
unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
int Pos = (i % NumEltsInLane) / 2 + LaneStart;
Pos += (Unary ? 0 : NumElts * (i % 2));
Pos += (Lo ? 0 : NumEltsInLane / 2);
Mask.push_back(Pos);
}
}
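// For example, for MVT::v8i32 with Lo = true and Unary = false this produces
// the mask {0, 8, 1, 9, 4, 12, 5, 13}, i.e. a per-128-bit-lane unpacklo of
// two v8i32 inputs.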
/// Helper function to scale a shuffle or target shuffle mask, replacing each
/// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can
/// always succeed.
template <typename T>
void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask,
SmallVectorImpl<T> &ScaledMask) {
assert(0 < Scale && "Unexpected scaling factor");
size_t NumElts = Mask.size();
ScaledMask.assign(NumElts * Scale, -1);
for (size_t i = 0; i != NumElts; ++i) {
int M = Mask[i];
// Repeat sentinel values in every mask element.
if (M < 0) {
for (size_t s = 0; s != Scale; ++s)
ScaledMask[(Scale * i) + s] = M;
continue;
}
// Scale mask element and increment across each mask element.
for (size_t s = 0; s != Scale; ++s)
ScaledMask[(Scale * i) + s] = (Scale * M) + s;
}
}
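// For example, scaling Mask = {0, -1, 3} by Scale = 2 yields
// ScaledMask = {0, 1, -1, -1, 6, 7}: each sentinel is repeated and each
// defined index M expands to {2*M, 2*M + 1}.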
} // end namespace llvm
#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp (nonexistent)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp (revision 362609)
@@ -0,0 +1,364 @@
+//==- X86IndirectThunks.cpp - Construct indirect call/jump thunks for x86 --=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Pass that injects an MI thunk that is used to lower indirect calls in a way
+/// that prevents speculation on some x86 processors and can be used to mitigate
+/// security vulnerabilities due to targeted speculative execution and side
+/// channels such as CVE-2017-5715.
+///
+/// Currently supported thunks include:
+/// - Retpoline -- A RET-implemented trampoline that lowers indirect calls
+/// - LVI Thunk -- A CALL/JMP-implemented thunk that forces load serialization
+/// before making an indirect call/jump
+///
+/// Note that the reason that this is implemented as a MachineFunctionPass and
+/// not a ModulePass is that ModulePasses at this point in the LLVM X86 pipeline
+/// serialize all transformations, which can consume lots of memory.
+///
+/// TODO(chandlerc): All of this code could use better comments and
+/// documentation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-retpoline-thunks"
+
+static const char RetpolineNamePrefix[] = "__llvm_retpoline_";
+static const char R11RetpolineName[] = "__llvm_retpoline_r11";
+static const char EAXRetpolineName[] = "__llvm_retpoline_eax";
+static const char ECXRetpolineName[] = "__llvm_retpoline_ecx";
+static const char EDXRetpolineName[] = "__llvm_retpoline_edx";
+static const char EDIRetpolineName[] = "__llvm_retpoline_edi";
+
+static const char LVIThunkNamePrefix[] = "__llvm_lvi_thunk_";
+static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11";
+
+namespace {
+template <typename Derived> class ThunkInserter {
+ Derived &getDerived() { return *static_cast<Derived *>(this); }
+
+protected:
+ bool InsertedThunks;
+ void doInitialization(Module &M) {}
+ void createThunkFunction(MachineModuleInfo &MMI, StringRef Name);
+
+public:
+ void init(Module &M) {
+ InsertedThunks = false;
+ getDerived().doInitialization(M);
+ }
+ // Returns `true` if `MMI` or `MF` was modified.
+ bool run(MachineModuleInfo &MMI, MachineFunction &MF);
+};
+
+struct RetpolineThunkInserter : ThunkInserter<RetpolineThunkInserter> {
+ const char *getThunkPrefix() { return RetpolineNamePrefix; }
+ bool mayUseThunk(const MachineFunction &MF) {
+ const auto &STI = MF.getSubtarget<X86Subtarget>();
+ return (STI.useRetpolineIndirectCalls() ||
+ STI.useRetpolineIndirectBranches()) &&
+ !STI.useRetpolineExternalThunk();
+ }
+ void insertThunks(MachineModuleInfo &MMI);
+ void populateThunk(MachineFunction &MF);
+};
+
+struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> {
+ const char *getThunkPrefix() { return LVIThunkNamePrefix; }
+ bool mayUseThunk(const MachineFunction &MF) {
+ return MF.getSubtarget<X86Subtarget>().useLVIControlFlowIntegrity();
+ }
+ void insertThunks(MachineModuleInfo &MMI) {
+ createThunkFunction(MMI, R11LVIThunkName);
+ }
+ void populateThunk(MachineFunction &MF) {
+ // Grab the entry MBB and erase any other blocks. O0 codegen appears to
+ // generate two bbs for the entry block.
+ MachineBasicBlock *Entry = &MF.front();
+ Entry->clear();
+ while (MF.size() > 1)
+ MF.erase(std::next(MF.begin()));
+
+ // This code mitigates LVI by replacing each indirect call/jump with a
+ // direct call/jump to a thunk that looks like:
+ // ```
+ // lfence
+ // jmpq *%r11
+ // ```
+ // This ensures that if the value in register %r11 was loaded from memory,
+ // then the value in %r11 is (architecturally) correct prior to the jump.
+ const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ BuildMI(&MF.front(), DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(&MF.front(), DebugLoc(), TII->get(X86::JMP64r)).addReg(X86::R11);
+ MF.front().addLiveIn(X86::R11);
+ return;
+ }
+};
+
+class X86IndirectThunks : public MachineFunctionPass {
+public:
+ static char ID;
+
+ X86IndirectThunks() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "X86 Indirect Thunks"; }
+
+ bool doInitialization(Module &M) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
+ }
+
+private:
+ std::tuple<RetpolineThunkInserter, LVIThunkInserter> TIs;
+
+ // FIXME: When LLVM moves to C++17, these can become folds
+ template <typename... ThunkInserterT>
+ static void initTIs(Module &M,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ (void)std::initializer_list<int>{
+ (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...};
+ }
+ template <typename... ThunkInserterT>
+ static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ bool Modified = false;
+ (void)std::initializer_list<int>{
+ Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...};
+ return Modified;
+ }
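+
+ // For reference, a C++17 version of the two helpers above could use fold
+ // expressions, roughly:
+ //   (std::get<ThunkInserterT>(ThunkInserters).init(M), ...);
+ //   return (std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF) | ...);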
+};
+
+} // end anonymous namespace
+
+void RetpolineThunkInserter::insertThunks(MachineModuleInfo &MMI) {
+ if (MMI.getTarget().getTargetTriple().getArch() == Triple::x86_64)
+ createThunkFunction(MMI, R11RetpolineName);
+ else
+ for (StringRef Name : {EAXRetpolineName, ECXRetpolineName, EDXRetpolineName,
+ EDIRetpolineName})
+ createThunkFunction(MMI, Name);
+}
+
+void RetpolineThunkInserter::populateThunk(MachineFunction &MF) {
+ bool Is64Bit = MF.getTarget().getTargetTriple().getArch() == Triple::x86_64;
+ Register ThunkReg;
+ if (Is64Bit) {
+ assert(MF.getName() == "__llvm_retpoline_r11" &&
+ "Should only have an r11 thunk on 64-bit targets");
+
+ // __llvm_retpoline_r11:
+ // callq .Lr11_call_target
+ // .Lr11_capture_spec:
+ // pause
+ // lfence
+ // jmp .Lr11_capture_spec
+ // .align 16
+ // .Lr11_call_target:
+ // movq %r11, (%rsp)
+ // retq
+ ThunkReg = X86::R11;
+ } else {
+ // For 32-bit targets we need to emit a collection of thunks for various
+ // possible scratch registers as well as a fallback that uses EDI, which is
+ // normally callee saved.
+ // __llvm_retpoline_eax:
+ // calll .Leax_call_target
+ // .Leax_capture_spec:
+ // pause
+ // jmp .Leax_capture_spec
+ // .align 16
+ // .Leax_call_target:
+ // movl %eax, (%esp) # Clobber return addr
+ // retl
+ //
+ // __llvm_retpoline_ecx:
+ // ... # Same setup
+ // movl %ecx, (%esp)
+ // retl
+ //
+ // __llvm_retpoline_edx:
+ // ... # Same setup
+ // movl %edx, (%esp)
+ // retl
+ //
+ // __llvm_retpoline_edi:
+ // ... # Same setup
+ // movl %edi, (%esp)
+ // retl
+ if (MF.getName() == EAXRetpolineName)
+ ThunkReg = X86::EAX;
+ else if (MF.getName() == ECXRetpolineName)
+ ThunkReg = X86::ECX;
+ else if (MF.getName() == EDXRetpolineName)
+ ThunkReg = X86::EDX;
+ else if (MF.getName() == EDIRetpolineName)
+ ThunkReg = X86::EDI;
+ else
+ llvm_unreachable("Invalid thunk name on x86-32!");
+ }
+
+ const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ // Grab the entry MBB and erase any other blocks. O0 codegen appears to
+ // generate two bbs for the entry block.
+ MachineBasicBlock *Entry = &MF.front();
+ Entry->clear();
+ while (MF.size() > 1)
+ MF.erase(std::next(MF.begin()));
+
+ MachineBasicBlock *CaptureSpec =
+ MF.CreateMachineBasicBlock(Entry->getBasicBlock());
+ MachineBasicBlock *CallTarget =
+ MF.CreateMachineBasicBlock(Entry->getBasicBlock());
+ MCSymbol *TargetSym = MF.getContext().createTempSymbol();
+ MF.push_back(CaptureSpec);
+ MF.push_back(CallTarget);
+
+ const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+ const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
+
+ Entry->addLiveIn(ThunkReg);
+ BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym);
+
+ // The MIR verifier thinks that the CALL in the entry block will fall through
+ // to CaptureSpec, so mark it as the successor. Technically, CallTarget is
+ // the successor, but the MIR verifier doesn't know how to cope with that.
+ Entry->addSuccessor(CaptureSpec);
+
+ // In the capture loop for speculation, we want to stop the processor from
+ // speculating as fast as possible. On Intel processors, the PAUSE instruction
+ // will block speculation without consuming any execution resources. On AMD
+ // processors, the PAUSE instruction is (essentially) a nop, so we also use an
+ // LFENCE instruction which they have advised will stop speculation as well
+ // with minimal resource utilization. We still end the capture with a jump to
+ // form an infinite loop, guaranteeing that no matter which implementation of
+ // the x86 ISA is used, speculation down this code path never escapes.
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE));
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec);
+ CaptureSpec->setHasAddressTaken();
+ CaptureSpec->addSuccessor(CaptureSpec);
+
+ CallTarget->addLiveIn(ThunkReg);
+ CallTarget->setHasAddressTaken();
+ CallTarget->setAlignment(Align(16));
+
+ // Insert return address clobber
+ const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr;
+ const Register SPReg = Is64Bit ? X86::RSP : X86::ESP;
+ addRegOffset(BuildMI(CallTarget, DebugLoc(), TII->get(MovOpc)), SPReg, false,
+ 0)
+ .addReg(ThunkReg);
+
+ CallTarget->back().setPreInstrSymbol(MF, TargetSym);
+ BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
+}
+
+template <typename Derived>
+void ThunkInserter<Derived>::createThunkFunction(MachineModuleInfo &MMI,
+ StringRef Name) {
+ assert(Name.startswith(getDerived().getThunkPrefix()) &&
+ "Created a thunk with an unexpected prefix!");
+
+ Module &M = const_cast<Module &>(*MMI.getModule());
+ LLVMContext &Ctx = M.getContext();
+ auto Type = FunctionType::get(Type::getVoidTy(Ctx), false);
+ Function *F =
+ Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M);
+ F->setVisibility(GlobalValue::HiddenVisibility);
+ F->setComdat(M.getOrInsertComdat(Name));
+
+ // Add attributes so that we don't create a frame, emit unwind information,
+ // or get inlined.
+ AttrBuilder B;
+ B.addAttribute(llvm::Attribute::NoUnwind);
+ B.addAttribute(llvm::Attribute::Naked);
+ F->addAttributes(llvm::AttributeList::FunctionIndex, B);
+
+ // Populate our function a bit so that we can verify.
+ BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
+ IRBuilder<> Builder(Entry);
+
+ Builder.CreateRetVoid();
+
+ // MachineFunctions/MachineBasicBlocks aren't created automatically for the
+ // IR-level constructs we already made. Create them and insert them into the
+ // module.
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+ MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry);
+
+ // Insert EntryMBB into MF. It's not in the module until we do this.
+ MF.insert(MF.end(), EntryMBB);
+ // Set MF properties. We never use vregs...
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+}
+
+template <typename Derived>
+bool ThunkInserter<Derived>::run(MachineModuleInfo &MMI, MachineFunction &MF) {
+ // If MF is not a thunk, check to see if we need to insert a thunk.
+ if (!MF.getName().startswith(getDerived().getThunkPrefix())) {
+ // If we've already inserted a thunk, nothing else to do.
+ if (InsertedThunks)
+ return false;
+
+ // Only add a thunk if one of the functions has the corresponding feature
+ // enabled in its subtarget, and doesn't enable external thunks.
+ // FIXME: Conditionalize on indirect calls so we don't emit a thunk when
+ // nothing will end up calling it.
+ // FIXME: It's a little silly to look at every function just to enumerate
+ // the subtargets, but eventually we'll want to look at them for indirect
+ // calls, so maybe this is OK.
+ if (!getDerived().mayUseThunk(MF))
+ return false;
+
+ getDerived().insertThunks(MMI);
+ InsertedThunks = true;
+ return true;
+ }
+
+ // If this *is* a thunk function, we need to populate it with the correct MI.
+ getDerived().populateThunk(MF);
+ return true;
+}
+
+FunctionPass *llvm::createX86IndirectThunksPass() {
+ return new X86IndirectThunks();
+}
+
+char X86IndirectThunks::ID = 0;
+
+bool X86IndirectThunks::doInitialization(Module &M) {
+ initTIs(M, TIs);
+ return false;
+}
+
+bool X86IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << getPassName() << '\n');
+ auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return runTIs(MMI, MF, TIs);
+}
Property changes on: head/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td (revision 362609)
@@ -1,2177 +1,2177 @@
//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the various pseudo instructions used by the compiler,
// as well as Pat patterns used during instruction selection.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Pattern Matching Support
def GetLo32XForm : SDNodeXForm<imm, [{
// Transformation function: get the low 32 bits.
return getI32Imm((uint32_t)N->getZExtValue(), SDLoc(N));
}]>;
//===----------------------------------------------------------------------===//
// Random Pseudo Instructions.
// PIC base construction. This expands to code that looks like this:
// call $next_inst
// popl %destreg
let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP],
SchedRW = [WriteJump] in
def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
"", []>;
// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
// a stack adjustment and the codegen must know that they may modify the stack
// pointer before prolog-epilog rewriting occurs.
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [ESP, EFLAGS, SSP], Uses = [ESP, SSP], SchedRW = [WriteALU] in {
def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
(ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN", []>, Requires<[NotLP64]>;
def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[NotLP64]>;
}
def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
(ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
// a stack adjustment and the codegen must know that they may modify the stack
// pointer before prolog-epilog rewriting occurs.
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [RSP, EFLAGS, SSP], Uses = [RSP, SSP], SchedRW = [WriteALU] in {
def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
(ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN", []>, Requires<[IsLP64]>;
def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[IsLP64]>;
}
def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
(ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
let SchedRW = [WriteSystem] in {
// x86-64 va_start lowering magic.
let usesCustomInserter = 1, Defs = [EFLAGS] in {
def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
(outs),
(ins GR8:$al,
i64imm:$regsavefi, i64imm:$offset,
variable_ops),
"#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
[(X86vastart_save_xmm_regs GR8:$al,
imm:$regsavefi,
imm:$offset),
(implicit EFLAGS)]>;
// The VAARG_64 pseudo-instruction takes the address of the va_list,
// and places the address of the next argument into a register.
let Defs = [EFLAGS] in
def VAARG_64 : I<0, Pseudo,
(outs GR64:$dst),
(ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
"#VAARG_64 $dst, $ap, $size, $mode, $align",
[(set GR64:$dst,
(X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
(implicit EFLAGS)]>;
// When using segmented stacks, these are lowered into instructions that first
// check if the current stacklet has enough free memory. If it does, memory is
// allocated by bumping the stack pointer. Otherwise memory is allocated from
// the heap.
let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
"# variable sized alloca for segmented stacks",
[(set GR32:$dst,
(X86SegAlloca GR32:$size))]>,
Requires<[NotLP64]>;
let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
"# variable sized alloca for segmented stacks",
[(set GR64:$dst,
(X86SegAlloca GR64:$size))]>,
Requires<[In64BitMode]>;
}
// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
// targets. These calls are needed to probe the stack when allocating more than
// 4k bytes in one go. Touching the stack at 4K increments is necessary to
// ensure that the guard pages used by the OS virtual memory manager are
// allocated in correct sequence.
// The main point of having a separate instruction is the extra unmodelled
// effects (compared to ordinary calls), such as the stack pointer change.
let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
def WIN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size),
"# dynamic stack allocation",
[(X86WinAlloca GR32:$size)]>,
Requires<[NotLP64]>;
let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
"# dynamic stack allocation",
[(X86WinAlloca GR64:$size)]>,
Requires<[In64BitMode]>;
} // SchedRW
// These instructions XOR the frame pointer into a GPR. They are used in some
// stack protection schemes. These are post-RA pseudos because we only know the
// frame register after register allocation.
let Constraints = "$src = $dst", isMoveImm = 1, isPseudo = 1, Defs = [EFLAGS] in {
def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
"xorl\t$$FP, $src", []>,
Requires<[NotLP64]>, Sched<[WriteALU]>;
def XOR64_FP : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src),
"xorq\t$$FP $src", []>,
Requires<[In64BitMode]>, Sched<[WriteALU]>;
}
//===----------------------------------------------------------------------===//
// EH Pseudo Instructions
//
let SchedRW = [WriteSystem] in {
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, isCodeGenOnly = 1 in {
def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
"ret\t#eh_return, addr: $addr",
[(X86ehret GR32:$addr)]>, Sched<[WriteJumpLd]>;
}
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, isCodeGenOnly = 1 in {
def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
"ret\t#eh_return, addr: $addr",
[(X86ehret GR64:$addr)]>, Sched<[WriteJumpLd]>;
}
let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
isCodeGenOnly = 1, isReturn = 1, isEHScopeReturn = 1 in {
def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>;
// CATCHRET needs a custom inserter for SEH.
let usesCustomInserter = 1 in
def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from),
"# CATCHRET",
[(catchret bb:$dst, bb:$from)]>;
}
let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
usesCustomInserter = 1 in
def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>;
// This instruction is responsible for re-establishing stack pointers after an
// exception has been caught and we are rejoining normal control flow in the
// parent function or funclet. It generally sets ESP and EBP, and optionally
// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us
// elsewhere.
let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in
def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>;
let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
usesCustomInserter = 1 in {
def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
"#EH_SJLJ_SETJMP32",
[(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
Requires<[Not64BitMode]>;
def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
"#EH_SJLJ_SETJMP64",
[(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
Requires<[In64BitMode]>;
let isTerminator = 1 in {
def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
"#EH_SJLJ_LONGJMP32",
[(X86eh_sjlj_longjmp addr:$buf)]>,
Requires<[Not64BitMode]>;
def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
"#EH_SJLJ_LONGJMP64",
[(X86eh_sjlj_longjmp addr:$buf)]>,
Requires<[In64BitMode]>;
}
}
let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
"#EH_SjLj_Setup\t$dst", []>;
}
} // SchedRW
//===----------------------------------------------------------------------===//
// Pseudo instructions used by unwind info.
//
let isPseudo = 1, SchedRW = [WriteSystem] in {
def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
"#SEH_PushReg $reg", []>;
def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
"#SEH_SaveReg $reg, $dst", []>;
def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
"#SEH_SaveXMM $reg, $dst", []>;
def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
"#SEH_StackAlloc $size", []>;
def SEH_StackAlign : I<0, Pseudo, (outs), (ins i32imm:$align),
"#SEH_StackAlign $align", []>;
def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
"#SEH_SetFrame $reg, $offset", []>;
def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
"#SEH_PushFrame $mode", []>;
def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
"#SEH_EndPrologue", []>;
def SEH_Epilogue : I<0, Pseudo, (outs), (ins),
"#SEH_Epilogue", []>;
}
//===----------------------------------------------------------------------===//
// Pseudo instructions used by segmented stacks.
//
// This is lowered into a RET instruction by MCInstLower. We need
// this so that we don't have to have a MachineBasicBlock which ends
// with a RET and also has successors.
let isPseudo = 1, SchedRW = [WriteJumpLd] in {
def MORESTACK_RET: I<0, Pseudo, (outs), (ins), "", []>;
// This instruction is lowered to a RET followed by a MOV. The two
// instructions are not generated at a higher level, since the verifier would
// then see a MachineBasicBlock ending with a non-terminator.
def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>;
}
//===----------------------------------------------------------------------===//
// Alias Instructions
//===----------------------------------------------------------------------===//
// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
isPseudo = 1, isMoveImm = 1, AddedComplexity = 10 in
def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 0)]>, Sched<[WriteZero]>;
// Other widths can also make use of the 32-bit xor, which may have a smaller
// encoding and avoid partial register updates.
let AddedComplexity = 10 in {
def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
}
let Predicates = [OptForSize, Not64BitMode],
AddedComplexity = 10 in {
let SchedRW = [WriteALU] in {
// Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
// which only require 3 bytes compared to MOV32ri which requires 5.
let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 1)]>;
def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, -1)]>;
}
} // SchedRW
// MOV16ri is 4 bytes, so the instructions above are smaller.
def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>;
def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
}
let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5,
SchedRW = [WriteALU] in {
// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
[(set GR32:$dst, i32immSExt8:$src)]>,
Requires<[OptForMinSize, NotWin64WithoutFP]>;
def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
[(set GR64:$dst, i64immSExt8:$src)]>,
Requires<[OptForMinSize, NotWin64WithoutFP]>;
}
// Materialize an i64 constant where the top 32 bits are zero. This could
// theoretically use MOV32ri with a SUBREG_TO_REG to represent the
// zero-extension; however, that would make it more difficult to rematerialize.
let isReMaterializable = 1, isAsCheapAsAMove = 1,
isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteMove] in
def MOV32ri64 : I<0, Pseudo, (outs GR64:$dst), (ins i64i32imm:$src), "", []>;
// This 64-bit pseudo-move can be used for both a 64-bit constant that is
// actually the zero-extension of a 32-bit constant and for labels in the
// x86-64 small code model.
def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>;
def : Pat<(i64 mov64imm32:$src), (MOV32ri64 mov64imm32:$src)>;
// Use sbb to materialize carry bit.
let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
// However, Pat<> can't replicate the destination reg into the inputs of the
// result.
def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
[(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
[(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
[(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
} // Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU]
def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C16r)>;
def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C32r)>;
def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C64r)>;
def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C16r)>;
def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C32r)>;
def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C64r)>;
// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" in the hope that the and
// will be eliminated and that the sbb can be extended up to a wider type. When
// this happens, it is great. However, if we are left with an 8-bit sbb and an
// and, we might as well just match it as a setb.
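// For example, zero-extending the carry flag into a 32-bit register is
// expected to end up as:
//   sbbl %eax, %eax # EAX = CF ? -1 : 0
//   andl $1, %eax # EAX = CF ? 1 : 0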
def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
(SETCCr (i8 2))>;
// Patterns to give priority when both inputs are zero so that we don't use
// an immediate for the RHS.
// TODO: Should we use a 32-bit sbb for 8/16 to push the extract_subreg out?
def : Pat<(X86sbb_flag (i8 0), (i8 0), EFLAGS),
(SBB8rr (EXTRACT_SUBREG (MOV32r0), sub_8bit),
(EXTRACT_SUBREG (MOV32r0), sub_8bit))>;
def : Pat<(X86sbb_flag (i16 0), (i16 0), EFLAGS),
(SBB16rr (EXTRACT_SUBREG (MOV32r0), sub_16bit),
(EXTRACT_SUBREG (MOV32r0), sub_16bit))>;
def : Pat<(X86sbb_flag (i32 0), (i32 0), EFLAGS),
(SBB32rr (MOV32r0), (MOV32r0))>;
def : Pat<(X86sbb_flag (i64 0), (i64 0), EFLAGS),
(SBB64rr (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit),
(SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit))>;
//===----------------------------------------------------------------------===//
// String Pseudo Instructions
//
let SchedRW = [WriteMicrocoded] in {
let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins),
"{rep;movsb (%esi), %es:(%edi)|rep movsb es:[edi], [esi]}",
[(X86rep_movs i8)]>, REP, AdSize32,
Requires<[NotLP64]>;
def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins),
"{rep;movsw (%esi), %es:(%edi)|rep movsw es:[edi], [esi]}",
[(X86rep_movs i16)]>, REP, AdSize32, OpSize16,
Requires<[NotLP64]>;
def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins),
"{rep;movsl (%esi), %es:(%edi)|rep movsd es:[edi], [esi]}",
[(X86rep_movs i32)]>, REP, AdSize32, OpSize32,
Requires<[NotLP64]>;
def REP_MOVSQ_32 : RI<0xA5, RawFrm, (outs), (ins),
"{rep;movsq (%esi), %es:(%edi)|rep movsq es:[edi], [esi]}",
[(X86rep_movs i64)]>, REP, AdSize32,
Requires<[NotLP64, In64BitMode]>;
}
let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins),
"{rep;movsb (%rsi), %es:(%rdi)|rep movsb es:[rdi], [rsi]}",
[(X86rep_movs i8)]>, REP, AdSize64,
Requires<[IsLP64]>;
def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins),
"{rep;movsw (%rsi), %es:(%rdi)|rep movsw es:[rdi], [rsi]}",
[(X86rep_movs i16)]>, REP, AdSize64, OpSize16,
Requires<[IsLP64]>;
def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins),
"{rep;movsl (%rsi), %es:(%rdi)|rep movsdi es:[rdi], [rsi]}",
[(X86rep_movs i32)]>, REP, AdSize64, OpSize32,
Requires<[IsLP64]>;
def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins),
"{rep;movsq (%rsi), %es:(%rdi)|rep movsq es:[rdi], [rsi]}",
[(X86rep_movs i64)]>, REP, AdSize64,
Requires<[IsLP64]>;
}
// FIXME: Should use "(X86rep_stos AL)" as the pattern.
let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
let Uses = [AL,ECX,EDI] in
def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins),
"{rep;stosb %al, %es:(%edi)|rep stosb es:[edi], al}",
[(X86rep_stos i8)]>, REP, AdSize32,
Requires<[NotLP64]>;
let Uses = [AX,ECX,EDI] in
def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins),
"{rep;stosw %ax, %es:(%edi)|rep stosw es:[edi], ax}",
[(X86rep_stos i16)]>, REP, AdSize32, OpSize16,
Requires<[NotLP64]>;
let Uses = [EAX,ECX,EDI] in
def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins),
"{rep;stosl %eax, %es:(%edi)|rep stosd es:[edi], eax}",
[(X86rep_stos i32)]>, REP, AdSize32, OpSize32,
Requires<[NotLP64]>;
let Uses = [RAX,RCX,RDI] in
def REP_STOSQ_32 : RI<0xAB, RawFrm, (outs), (ins),
"{rep;stosq %rax, %es:(%edi)|rep stosq es:[edi], rax}",
[(X86rep_stos i64)]>, REP, AdSize32,
Requires<[NotLP64, In64BitMode]>;
}
let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
let Uses = [AL,RCX,RDI] in
def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins),
"{rep;stosb %al, %es:(%rdi)|rep stosb es:[rdi], al}",
[(X86rep_stos i8)]>, REP, AdSize64,
Requires<[IsLP64]>;
let Uses = [AX,RCX,RDI] in
def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins),
"{rep;stosw %ax, %es:(%rdi)|rep stosw es:[rdi], ax}",
[(X86rep_stos i16)]>, REP, AdSize64, OpSize16,
Requires<[IsLP64]>;
let Uses = [RAX,RCX,RDI] in
def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins),
"{rep;stosl %eax, %es:(%rdi)|rep stosd es:[rdi], eax}",
[(X86rep_stos i32)]>, REP, AdSize64, OpSize32,
Requires<[IsLP64]>;
let Uses = [RAX,RCX,RDI] in
def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins),
"{rep;stosq %rax, %es:(%rdi)|rep stosq es:[rdi], rax}",
[(X86rep_stos i64)]>, REP, AdSize64,
Requires<[IsLP64]>;
}
} // SchedRW
//===----------------------------------------------------------------------===//
// Thread Local Storage Instructions
//
let SchedRW = [WriteSystem] in {
// ELF TLS Support
// All calls clobber the non-callee saved registers. ESP is marked as
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead.
let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF],
usesCustomInserter = 1, Uses = [ESP, SSP] in {
def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_addr32",
[(X86tlsaddr tls32addr:$sym)]>,
Requires<[Not64BitMode]>;
def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_base_addr32",
[(X86tlsbaseaddr tls32baseaddr:$sym)]>,
Requires<[Not64BitMode]>;
}
// All calls clobber the non-callee saved registers. RSP is marked as
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead.
let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF],
usesCustomInserter = 1, Uses = [RSP, SSP] in {
def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_addr64",
[(X86tlsaddr tls64addr:$sym)]>,
Requires<[In64BitMode]>;
def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_base_addr64",
[(X86tlsbaseaddr tls64baseaddr:$sym)]>,
Requires<[In64BitMode]>;
}
// Darwin TLS Support
// For i386, the address of the thunk is passed on the stack; on return, the
// address of the variable is in %eax. %ecx is trashed during the function
// call. All other registers are preserved.
let Defs = [EAX, ECX, EFLAGS, DF],
Uses = [ESP, SSP],
usesCustomInserter = 1 in
def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLSCall_32",
[(X86TLSCall addr:$sym)]>,
Requires<[Not64BitMode]>;
// For x86_64, the address of the thunk is passed in %rdi, but the
// pseudo directly uses the symbol, so do not add an implicit use of
// %rdi. The lowering will do the right thing with RDI.
// On return the address of the variable is in %rax. All other
// registers are preserved.
let Defs = [RAX, EFLAGS, DF],
Uses = [RSP, SSP],
usesCustomInserter = 1 in
def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLSCall_64",
[(X86TLSCall addr:$sym)]>,
Requires<[In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
// Conditional Move Pseudo Instructions
// CMOV* - Used to implement the SELECT DAG operation. Expanded after
// instruction selection into a branch sequence.
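// Roughly, "$dst = $cond ? $t : $f" is expanded by the custom inserter into:
//   jCC .LsinkMBB # condition holds: keep $t
// .LcopyMBB: # fall through: select $f
// .LsinkMBB:
//   $dst = phi([$t, entry], [$f, .LcopyMBB])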
multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> {
def CMOV#NAME : I<0, Pseudo,
(outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond),
"#CMOV_"#NAME#" PSEUDO!",
[(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, timm:$cond,
EFLAGS)))]>;
}
let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
// X86 doesn't have 8-bit conditional moves. Use a customInserter to
// emit control flow. An alternative to this is to mark i8 SELECT as Promote;
// however, that requires promoting the operands and can induce additional
// i8 register pressure.
defm _GR8 : CMOVrr_PSEUDO<GR8, i8>;
let Predicates = [NoCMov] in {
defm _GR32 : CMOVrr_PSEUDO<GR32, i32>;
defm _GR16 : CMOVrr_PSEUDO<GR16, i16>;
} // Predicates = [NoCMov]
// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
// SSE1/SSE2.
let Predicates = [FPStackf32] in
defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>;
let Predicates = [FPStackf64] in
defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>;
defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;
let Predicates = [NoAVX512] in {
defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
}
let Predicates = [HasAVX512] in {
defm _FR32X : CMOVrr_PSEUDO<FR32X, f32>;
defm _FR64X : CMOVrr_PSEUDO<FR64X, f64>;
}
let Predicates = [NoVLX] in {
defm _VR128 : CMOVrr_PSEUDO<VR128, v2i64>;
defm _VR256 : CMOVrr_PSEUDO<VR256, v4i64>;
}
let Predicates = [HasVLX] in {
defm _VR128X : CMOVrr_PSEUDO<VR128X, v2i64>;
defm _VR256X : CMOVrr_PSEUDO<VR256X, v4i64>;
}
defm _VR512 : CMOVrr_PSEUDO<VR512, v8i64>;
defm _VK2 : CMOVrr_PSEUDO<VK2, v2i1>;
defm _VK4 : CMOVrr_PSEUDO<VK4, v4i1>;
defm _VK8 : CMOVrr_PSEUDO<VK8, v8i1>;
defm _VK16 : CMOVrr_PSEUDO<VK16, v16i1>;
defm _VK32 : CMOVrr_PSEUDO<VK32, v32i1>;
defm _VK64 : CMOVrr_PSEUDO<VK64, v64i1>;
} // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS]
def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
(CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
let Predicates = [NoVLX] in {
def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
(CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
(CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
(CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
(CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
(CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
(CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
(CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
(CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
(CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
(CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
(CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
(CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
(CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
(CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
(CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
(CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
(CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
(CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
(CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
(CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
}
def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//
// FIXME: Use normal instructions and add lock prefix dynamically.
// Memory barriers
let isCodeGenOnly = 1, Defs = [EFLAGS] in
def OR32mi8Locked : Ii8<0x83, MRM1m, (outs), (ins i32mem:$dst, i32i8imm:$zero),
"or{l}\t{$zero, $dst|$dst, $zero}", []>,
Requires<[Not64BitMode]>, OpSize32, LOCK,
Sched<[WriteALURMW]>;
let hasSideEffects = 1 in
def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
"#MEMBARRIER",
[(X86MemBarrier)]>, Sched<[WriteLoad]>;
// RegOpc corresponds to the mr version of the instruction
// ImmOpc corresponds to the mi version of the instruction
// ImmOpc8 corresponds to the mi8 version of the instruction
// ImmMod corresponds to the instruction format of the mi and mi8 versions
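// For example, the LOCK_ADD instantiation below passes RegOpc = 0x00 (add mr),
// ImmOpc = 0x80 (add mi), ImmOpc8 = 0x83 (add mi8) and MRM0m, i.e. the
// standard ADD opcodes with /0 as the opcode extension.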
multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
Format ImmMod, SDNode Op, string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
SchedRW = [WriteALURMW] in {
def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
!strconcat(mnemonic, "{b}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, GR8:$src2))]>, LOCK;
def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, GR16:$src2))]>,
OpSize16, LOCK;
def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, GR32:$src2))]>,
OpSize32, LOCK;
def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, GR64:$src2))]>, LOCK;
// NOTE: These are order specific; we want the mi8 forms to be listed
// first so that they are slightly preferred to the mi forms.
def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>,
OpSize16, LOCK;
def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>,
OpSize32, LOCK;
def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>,
LOCK;
def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
!strconcat(mnemonic, "{b}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))]>, LOCK;
def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))]>,
OpSize16, LOCK;
def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))]>,
OpSize32, LOCK;
def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))]>,
LOCK;
}
}
defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, X86lock_add, "add">;
defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, X86lock_sub, "sub">;
defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, X86lock_or , "or">;
defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, X86lock_and, "and">;
defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">;
def X86lock_add_nocf : PatFrag<(ops node:$lhs, node:$rhs),
(X86lock_add node:$lhs, node:$rhs), [{
return hasNoCarryFlagUses(SDValue(N, 0));
}]>;
def X86lock_sub_nocf : PatFrag<(ops node:$lhs, node:$rhs),
(X86lock_sub node:$lhs, node:$rhs), [{
return hasNoCarryFlagUses(SDValue(N, 0));
}]>;
let Predicates = [UseIncDec] in {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
SchedRW = [WriteALURMW] in {
def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst),
"inc{b}\t$dst",
[(set EFLAGS, (X86lock_add_nocf addr:$dst, (i8 1)))]>,
LOCK;
def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst),
"inc{w}\t$dst",
[(set EFLAGS, (X86lock_add_nocf addr:$dst, (i16 1)))]>,
OpSize16, LOCK;
def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst),
"inc{l}\t$dst",
[(set EFLAGS, (X86lock_add_nocf addr:$dst, (i32 1)))]>,
OpSize32, LOCK;
def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst),
"inc{q}\t$dst",
[(set EFLAGS, (X86lock_add_nocf addr:$dst, (i64 1)))]>,
LOCK;
def LOCK_DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst),
"dec{b}\t$dst",
[(set EFLAGS, (X86lock_sub_nocf addr:$dst, (i8 1)))]>,
LOCK;
def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst),
"dec{w}\t$dst",
[(set EFLAGS, (X86lock_sub_nocf addr:$dst, (i16 1)))]>,
OpSize16, LOCK;
def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst),
"dec{l}\t$dst",
[(set EFLAGS, (X86lock_sub_nocf addr:$dst, (i32 1)))]>,
OpSize32, LOCK;
def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
"dec{q}\t$dst",
[(set EFLAGS, (X86lock_sub_nocf addr:$dst, (i64 1)))]>,
LOCK;
}
// Additional patterns for -1 constant.
def : Pat<(X86lock_add addr:$dst, (i8 -1)), (LOCK_DEC8m addr:$dst)>;
def : Pat<(X86lock_add addr:$dst, (i16 -1)), (LOCK_DEC16m addr:$dst)>;
def : Pat<(X86lock_add addr:$dst, (i32 -1)), (LOCK_DEC32m addr:$dst)>;
def : Pat<(X86lock_add addr:$dst, (i64 -1)), (LOCK_DEC64m addr:$dst)>;
def : Pat<(X86lock_sub addr:$dst, (i8 -1)), (LOCK_INC8m addr:$dst)>;
def : Pat<(X86lock_sub addr:$dst, (i16 -1)), (LOCK_INC16m addr:$dst)>;
def : Pat<(X86lock_sub addr:$dst, (i32 -1)), (LOCK_INC32m addr:$dst)>;
def : Pat<(X86lock_sub addr:$dst, (i64 -1)), (LOCK_INC64m addr:$dst)>;
}
// Atomic compare and swap.
multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
SDPatternOperator frag, X86MemOperand x86memop> {
let isCodeGenOnly = 1, usesCustomInserter = 1 in {
def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
!strconcat(mnemonic, "\t$ptr"),
[(frag addr:$ptr)]>, TB, LOCK;
}
}
multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
string mnemonic, SDPatternOperator frag> {
let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in {
let Defs = [AL, EFLAGS], Uses = [AL] in
def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
!strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
[(frag addr:$ptr, GR8:$swap, 1)]>, TB, LOCK;
let Defs = [AX, EFLAGS], Uses = [AX] in
def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
!strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
[(frag addr:$ptr, GR16:$swap, 2)]>, TB, OpSize16, LOCK;
let Defs = [EAX, EFLAGS], Uses = [EAX] in
def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
!strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
[(frag addr:$ptr, GR32:$swap, 4)]>, TB, OpSize32, LOCK;
let Defs = [RAX, EFLAGS], Uses = [RAX] in
def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
!strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
[(frag addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
}
}
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW] in {
defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
}
// This pseudo must be used when the frame uses RBX as
// the base pointer. Indeed, in such a situation RBX is a reserved
// register and the register allocator will ignore any use/def of
// it. In other words, the register allocator will not fix the clobbering
// of RBX that will happen when setting the arguments for the instruction.
//
// Unlike the actual related instruction, we mark that this one
// defines EBX (instead of using EBX).
// The rationale is that we will define RBX during the expansion of
// the pseudo. The argument feeding EBX is ebx_input.
//
// The additional argument, $ebx_save, is a temporary register used to
// save the value of RBX across the actual instruction.
//
// To make sure the register assigned to $ebx_save does not interfere with
// the definition of the actual instruction, we use a definition $dst which
// is tied to $ebx_save. That way, the live-range of $ebx_save spans across
// the instruction and we are sure we will have a valid register to restore
// the value of RBX.
let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW],
isCodeGenOnly = 1, isPseudo = 1, Constraints = "$ebx_save = $dst",
usesCustomInserter = 1 in {
def LCMPXCHG8B_SAVE_EBX :
I<0, Pseudo, (outs GR32:$dst),
(ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
!strconcat("cmpxchg8b", "\t$ptr"),
[(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input,
GR32:$ebx_save))]>;
}
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW] in {
defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
X86cas16, i128mem>, REX_W;
}
// Same as LCMPXCHG8B_SAVE_EBX but for the 16-byte variant.
let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst",
usesCustomInserter = 1 in {
def LCMPXCHG16B_SAVE_RBX :
I<0, Pseudo, (outs GR64:$dst),
(ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save),
!strconcat("cmpxchg16b", "\t$ptr"),
[(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input,
GR64:$rbx_save))]>;
}
defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas>;
// Atomic exchange and add
multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
string frag> {
let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
SchedRW = [WriteALURMW] in {
def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
(ins GR8:$val, i8mem:$ptr),
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
[(set GR8:$dst,
(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>;
def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$val, i16mem:$ptr),
!strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
[(set
GR16:$dst,
(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>,
OpSize16;
def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
OpSize32;
def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
!strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
[(set
GR64:$dst,
(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>;
}
}
defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add">, TB, LOCK;
/* The following multiclass tries to make sure that in code like
* x.store (immediate op x.load(acquire), release)
* and
* x.store (register op x.load(acquire), release)
* an operation directly on memory is generated instead of wasting a register.
* It is not automatic as atomic_store/load are only lowered to MOV instructions
* extremely late to prevent them from being accidentally reordered in the backend
* (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
*/
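// For example, for an i32 "x.store(x.load(acquire) + 5, release)" these
// patterns aim to select a single "addl $5, (mem)" instead of a
// load/add/store sequence through a scratch register.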
multiclass RELEASE_BINOP_MI<string Name, SDNode op> {
def : Pat<(atomic_store_8 addr:$dst,
(op (atomic_load_8 addr:$dst), (i8 imm:$src))),
(!cast<Instruction>(Name#"8mi") addr:$dst, imm:$src)>;
def : Pat<(atomic_store_16 addr:$dst,
(op (atomic_load_16 addr:$dst), (i16 imm:$src))),
(!cast<Instruction>(Name#"16mi") addr:$dst, imm:$src)>;
def : Pat<(atomic_store_32 addr:$dst,
(op (atomic_load_32 addr:$dst), (i32 imm:$src))),
(!cast<Instruction>(Name#"32mi") addr:$dst, imm:$src)>;
def : Pat<(atomic_store_64 addr:$dst,
(op (atomic_load_64 addr:$dst), (i64immSExt32:$src))),
(!cast<Instruction>(Name#"64mi32") addr:$dst, (i64immSExt32:$src))>;
def : Pat<(atomic_store_8 addr:$dst,
(op (atomic_load_8 addr:$dst), (i8 GR8:$src))),
(!cast<Instruction>(Name#"8mr") addr:$dst, GR8:$src)>;
def : Pat<(atomic_store_16 addr:$dst,
(op (atomic_load_16 addr:$dst), (i16 GR16:$src))),
(!cast<Instruction>(Name#"16mr") addr:$dst, GR16:$src)>;
def : Pat<(atomic_store_32 addr:$dst,
(op (atomic_load_32 addr:$dst), (i32 GR32:$src))),
(!cast<Instruction>(Name#"32mr") addr:$dst, GR32:$src)>;
def : Pat<(atomic_store_64 addr:$dst,
(op (atomic_load_64 addr:$dst), (i64 GR64:$src))),
(!cast<Instruction>(Name#"64mr") addr:$dst, GR64:$src)>;
}
defm : RELEASE_BINOP_MI<"ADD", add>;
defm : RELEASE_BINOP_MI<"AND", and>;
defm : RELEASE_BINOP_MI<"OR", or>;
defm : RELEASE_BINOP_MI<"XOR", xor>;
defm : RELEASE_BINOP_MI<"SUB", sub>;
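// Illustrative sketch (editor's example, not from the original source): with
// the patterns above, a read-modify-write such as
//   std::atomic<int> x;
//   x.store(x.load(std::memory_order_acquire) + 42, std::memory_order_release);
// can select to a single memory-destination `addl $42, (%mem)` rather than a
// load/add/store sequence through a scratch register; on x86, plain MOVs
// already provide the acquire/release semantics.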
// Atomic load + floating point patterns.
// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
multiclass ATOMIC_LOAD_FP_BINOP_MI<string Name, SDNode op> {
def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
(!cast<Instruction>(Name#"SSrm") FR32:$src1, addr:$src2)>,
Requires<[UseSSE1]>;
def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
(!cast<Instruction>("V"#Name#"SSrm") FR32:$src1, addr:$src2)>,
Requires<[UseAVX]>;
def : Pat<(op FR32X:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
(!cast<Instruction>("V"#Name#"SSZrm") FR32X:$src1, addr:$src2)>,
Requires<[HasAVX512]>;
def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
(!cast<Instruction>(Name#"SDrm") FR64:$src1, addr:$src2)>,
Requires<[UseSSE2]>; // scalar-double instructions require SSE2, not SSE1
def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
(!cast<Instruction>("V"#Name#"SDrm") FR64:$src1, addr:$src2)>,
Requires<[UseAVX]>;
def : Pat<(op FR64X:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
(!cast<Instruction>("V"#Name#"SDZrm") FR64X:$src1, addr:$src2)>,
Requires<[HasAVX512]>;
}
defm : ATOMIC_LOAD_FP_BINOP_MI<"ADD", fadd>;
// FIXME: Add fsub, fmul, fdiv, ...
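// Illustrative sketch (editor's example): for
//   float f(float a, std::atomic<uint32_t> &x) {
//     return a + std::bit_cast<float>(x.load(std::memory_order_relaxed));
//   }
// these patterns fold the atomic load straight into the FP instruction's
// memory operand, e.g. `addss (%mem), %xmm0`, instead of bouncing the bits
// through a GPR.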
multiclass RELEASE_UNOP<string Name, dag dag8, dag dag16, dag dag32,
dag dag64> {
def : Pat<(atomic_store_8 addr:$dst, dag8),
(!cast<Instruction>(Name#8m) addr:$dst)>;
def : Pat<(atomic_store_16 addr:$dst, dag16),
(!cast<Instruction>(Name#16m) addr:$dst)>;
def : Pat<(atomic_store_32 addr:$dst, dag32),
(!cast<Instruction>(Name#32m) addr:$dst)>;
def : Pat<(atomic_store_64 addr:$dst, dag64),
(!cast<Instruction>(Name#64m) addr:$dst)>;
}
let Predicates = [UseIncDec] in {
defm : RELEASE_UNOP<"INC",
(add (atomic_load_8 addr:$dst), (i8 1)),
(add (atomic_load_16 addr:$dst), (i16 1)),
(add (atomic_load_32 addr:$dst), (i32 1)),
(add (atomic_load_64 addr:$dst), (i64 1))>;
defm : RELEASE_UNOP<"DEC",
(add (atomic_load_8 addr:$dst), (i8 -1)),
(add (atomic_load_16 addr:$dst), (i16 -1)),
(add (atomic_load_32 addr:$dst), (i32 -1)),
(add (atomic_load_64 addr:$dst), (i64 -1))>;
}
defm : RELEASE_UNOP<"NEG",
(ineg (i8 (atomic_load_8 addr:$dst))),
(ineg (i16 (atomic_load_16 addr:$dst))),
(ineg (i32 (atomic_load_32 addr:$dst))),
(ineg (i64 (atomic_load_64 addr:$dst)))>;
defm : RELEASE_UNOP<"NOT",
(not (i8 (atomic_load_8 addr:$dst))),
(not (i16 (atomic_load_16 addr:$dst))),
(not (i32 (atomic_load_32 addr:$dst))),
(not (i64 (atomic_load_64 addr:$dst)))>;
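// Illustrative sketch (editor's example): under UseIncDec the INC/DEC
// patterns above let
//   x.store(x.load(std::memory_order_acquire) + 1, std::memory_order_release);
// select to a single `incl (%mem)`, and the NOT patterns similarly turn
//   x.store(~x.load(std::memory_order_acquire), std::memory_order_release);
// into `notl (%mem)`.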
def : Pat<(atomic_store_8 addr:$dst, (i8 imm:$src)),
(MOV8mi addr:$dst, imm:$src)>;
def : Pat<(atomic_store_16 addr:$dst, (i16 imm:$src)),
(MOV16mi addr:$dst, imm:$src)>;
def : Pat<(atomic_store_32 addr:$dst, (i32 imm:$src)),
(MOV32mi addr:$dst, imm:$src)>;
def : Pat<(atomic_store_64 addr:$dst, (i64immSExt32:$src)),
(MOV64mi32 addr:$dst, i64immSExt32:$src)>;
def : Pat<(atomic_store_8 addr:$dst, GR8:$src),
(MOV8mr addr:$dst, GR8:$src)>;
def : Pat<(atomic_store_16 addr:$dst, GR16:$src),
(MOV16mr addr:$dst, GR16:$src)>;
def : Pat<(atomic_store_32 addr:$dst, GR32:$src),
(MOV32mr addr:$dst, GR32:$src)>;
def : Pat<(atomic_store_64 addr:$dst, GR64:$src),
(MOV64mr addr:$dst, GR64:$src)>;
def : Pat<(i8 (atomic_load_8 addr:$src)), (MOV8rm addr:$src)>;
def : Pat<(i16 (atomic_load_16 addr:$src)), (MOV16rm addr:$src)>;
def : Pat<(i32 (atomic_load_32 addr:$src)), (MOV32rm addr:$src)>;
def : Pat<(i64 (atomic_load_64 addr:$src)), (MOV64rm addr:$src)>;
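// Illustrative sketch (editor's example): because naturally aligned x86
// loads and stores are already atomic, even
//   int v = x.load(std::memory_order_seq_cst);
// is just `movl (%mem), %eax`; only seq_cst stores need anything stronger
// (an XCHG or a trailing MFENCE, handled elsewhere).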
// Floating point loads/stores.
def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
(MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
(VMOVSSmr addr:$dst, FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
(VMOVSSZmr addr:$dst, FR32:$src)>, Requires<[HasAVX512]>;
def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
(MOVSDmr addr:$dst, FR64:$src)>, Requires<[UseSSE2]>;
def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
(VMOVSDmr addr:$dst, FR64:$src)>, Requires<[UseAVX]>;
def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
(VMOVSDZmr addr:$dst, FR64:$src)>, Requires<[HasAVX512]>;
def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
(MOVSSrm_alt addr:$src)>, Requires<[UseSSE1]>;
def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
(VMOVSSrm_alt addr:$src)>, Requires<[UseAVX]>;
def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
(VMOVSSZrm_alt addr:$src)>, Requires<[HasAVX512]>;
def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
(MOVSDrm_alt addr:$src)>, Requires<[UseSSE2]>;
def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
(VMOVSDrm_alt addr:$src)>, Requires<[UseAVX]>;
def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
(VMOVSDZrm_alt addr:$src)>, Requires<[HasAVX512]>;
//===----------------------------------------------------------------------===//
// DAG Pattern Matching Rules
//===----------------------------------------------------------------------===//
// Use AND/OR to store 0/-1 in memory when optimizing for minsize. This saves
// binary size compared to a regular MOV, but it introduces an unnecessary
// load, so is not suitable for regular or optsize functions.
let Predicates = [OptForMinSize] in {
def : Pat<(simple_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>;
def : Pat<(simple_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>;
def : Pat<(simple_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>;
def : Pat<(simple_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>;
def : Pat<(simple_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
def : Pat<(simple_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
}
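// Size sketch (editor's example): `movl $0, (%rax)` encodes as
// C7 00 00 00 00 00 (6 bytes, imm32), while `andl $0, (%rax)` encodes as
// 83 20 00 (3 bytes, imm8), so the AND form wins under minsize despite the
// extra load it introduces.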
// In the kernel code model, we can get the address of a label
// into a register with 'movq'. FIXME: This is a hack; the 'imm' predicate of
// MOV64ri32 should accept these.
def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
(MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
(MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
(MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
(MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper mcsym:$dst)),
(MOV64ri32 mcsym:$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
(MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
// In the small code model with -static, it is safe to store global addresses
// directly as immediates. FIXME: This is really a hack; the 'imm' predicate
// for MOV64mi32 should handle this sort of thing.
def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tconstpool:$src)>,
Requires<[NearData, IsNotPIC]>;
def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tjumptable:$src)>,
Requires<[NearData, IsNotPIC]>;
def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tglobaladdr:$src)>,
Requires<[NearData, IsNotPIC]>;
def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
(MOV64mi32 addr:$dst, texternalsym:$src)>,
Requires<[NearData, IsNotPIC]>;
def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst),
(MOV64mi32 addr:$dst, mcsym:$src)>,
Requires<[NearData, IsNotPIC]>;
def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tblockaddress:$src)>,
Requires<[NearData, IsNotPIC]>;
def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>;
def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>;
// Calls
// TLS has some special cases here...
// This corresponds to movabs $foo@tpoff, %rax
def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
(MOV64ri32 tglobaltlsaddr :$dst)>;
// This corresponds to add $foo@tpoff, %rax
def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
(ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
// Direct PC relative function call for small code model. 32-bit displacement
// sign extended to 64-bit.
def : Pat<(X86call (i64 tglobaladdr:$dst)),
(CALL64pcrel32 tglobaladdr:$dst)>;
def : Pat<(X86call (i64 texternalsym:$dst)),
(CALL64pcrel32 texternalsym:$dst)>;
// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
// can never use callee-saved registers. That is the purpose of the GR64_TC
// register classes.
//
// The only volatile register that is never used by the calling convention is
// %r11: in the worst case, a vararg call with 6 register arguments, every
// other volatile register already carries an argument or the vector count.
//
// Match an X86tcret that uses less than 7 volatile registers.
def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
(X86tcret node:$ptr, node:$off), [{
// X86tcret args: (*chain, ptr, imm, regs..., glue)
unsigned NumRegs = 0;
for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
return false;
return true;
}]>;
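// Editor's note (sketch): in the SysV AMD64 convention a vararg call can use
// RDI, RSI, RDX, RCX, R8 and R9 for arguments plus AL for the vector count,
// leaving %r11 as the only volatile register guaranteed free to hold the
// indirect tail-call target, e.g. `jmpq *%r11`.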
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[Not64BitMode, NotUseRetpolineIndirectCalls]>;
+ Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;
// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
// callee-saved register.
def : Pat<(X86tcret (load addr:$dst), imm:$off),
(TCRETURNmi addr:$dst, imm:$off)>,
- Requires<[Not64BitMode, IsNotPIC, NotUseRetpolineIndirectCalls]>;
+ Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>;
def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
(TCRETURNdi tglobaladdr:$dst, imm:$off)>,
Requires<[NotLP64]>;
def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
(TCRETURNdi texternalsym:$dst, imm:$off)>,
Requires<[NotLP64]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[In64BitMode, NotUseRetpolineIndirectCalls]>;
+ Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
(TCRETURNmi64 addr:$dst, imm:$off)>,
- Requires<[In64BitMode, NotUseRetpolineIndirectCalls]>;
+ Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
- (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[In64BitMode, UseRetpolineIndirectCalls]>;
+ (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[In64BitMode, UseIndirectThunkCalls]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
- (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[Not64BitMode, UseRetpolineIndirectCalls]>;
+ (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[Not64BitMode, UseIndirectThunkCalls]>;
def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
(TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
Requires<[IsLP64]>;
def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
(TCRETURNdi64 texternalsym:$dst, imm:$off)>,
Requires<[IsLP64]>;
// Normal calls, with various flavors of addresses.
def : Pat<(X86call (i32 tglobaladdr:$dst)),
(CALLpcrel32 tglobaladdr:$dst)>;
def : Pat<(X86call (i32 texternalsym:$dst)),
(CALLpcrel32 texternalsym:$dst)>;
def : Pat<(X86call (i32 imm:$dst)),
(CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
// Comparisons.
// TEST R,R is smaller than CMP R,0
def : Pat<(X86cmp GR8:$src1, 0),
(TEST8rr GR8:$src1, GR8:$src1)>;
def : Pat<(X86cmp GR16:$src1, 0),
(TEST16rr GR16:$src1, GR16:$src1)>;
def : Pat<(X86cmp GR32:$src1, 0),
(TEST32rr GR32:$src1, GR32:$src1)>;
def : Pat<(X86cmp GR64:$src1, 0),
(TEST64rr GR64:$src1, GR64:$src1)>;
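// Size sketch (editor's example): `testl %eax, %eax` is 85 C0 (2 bytes),
// while `cmpl $0, %eax` is 83 F8 00 (3 bytes); both produce the same flags
// for a comparison against zero.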
// zextload bool -> zextload byte
// An i1 is stored in one byte in zero-extended form;
// the upper bits are cleared before the store.
def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
def : Pat<(zextloadi16i1 addr:$src),
(EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
def : Pat<(zextloadi64i1 addr:$src),
(SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
// extload bool -> extload byte
// When extloading from 16-bit and smaller memory locations into 64-bit
// registers, use zero-extending loads so that the entire 64-bit register is
// defined, avoiding partial-register updates.
def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
def : Pat<(extloadi16i1 addr:$src),
(EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
def : Pat<(extloadi16i8 addr:$src),
(EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
// For other extloads, use subregs, since the high contents of the register are
// defined after an extload.
// NOTE: The extloadi64i32 pattern needs to be first as it will try to form
// 32-bit loads for 4-byte-aligned i8/i16 loads.
def : Pat<(extloadi64i32 addr:$src),
(SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
def : Pat<(extloadi64i1 addr:$src),
(SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i8 addr:$src),
(SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i16 addr:$src),
(SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
// anyext. Define these to do an explicit zero-extend to
// avoid partial-register updates.
def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
(MOVZX32rr8 GR8 :$src), sub_16bit)>;
def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
// Except for i16 -> i32, since isel expects i16 ops to be promoted to i32.
def : Pat<(i32 (anyext GR16:$src)),
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
def : Pat<(i64 (anyext GR8 :$src)),
(SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>;
def : Pat<(i64 (anyext GR16:$src)),
(SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
def : Pat<(i64 (anyext GR32:$src)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, sub_32bit)>;
// If this is an anyext of the remainder of an 8-bit sdivrem, use a MOVSX
// instead of a MOVZX. The sdivrem lowering will emit a MOVSX to move
// %ah to the lower byte of a register. By using a MOVSX here we allow a
// post-isel peephole to merge the two MOVSX instructions into one.
def anyext_sdiv : PatFrag<(ops node:$lhs), (anyext node:$lhs),[{
return (N->getOperand(0).getOpcode() == ISD::SDIVREM &&
N->getOperand(0).getResNo() == 1);
}]>;
def : Pat<(i32 (anyext_sdiv GR8:$src)), (MOVSX32rr8 GR8:$src)>;
// Any instruction that defines a 32-bit result zeroes the upper half of the
// 64-bit register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg
// may be copying from a truncate, so neither of those guarantees it. Any
// other 32-bit operation will zero-extend up to 64 bits. AssertSext/AssertZext
// say nothing about the upper 32 bits; they're probably just qualifying a
// CopyFromReg.
def def32 : PatLeaf<(i32 GR32:$src), [{
return N->getOpcode() != ISD::TRUNCATE &&
N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
N->getOpcode() != ISD::CopyFromReg &&
N->getOpcode() != ISD::AssertSext &&
N->getOpcode() != ISD::AssertZext;
}]>;
// In the case of a 32-bit def that is known to implicitly zero-extend,
// we can use a SUBREG_TO_REG.
def : Pat<(i64 (zext def32:$src)),
(SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
def : Pat<(i64 (and (anyext def32:$src), 0x00000000FFFFFFFF)),
(SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
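// Illustrative sketch (editor's example): in
//   uint64_t f(uint32_t a, uint32_t b) { return (uint64_t)(a + b); }
// the 32-bit ADD already zeroes the upper half of the result register, so
// the zero-extension is a free SUBREG_TO_REG and no extra `movl` is emitted.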
//===----------------------------------------------------------------------===//
// Pattern match OR as ADD
//===----------------------------------------------------------------------===//
// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
// 3-addressified into an LEA instruction to avoid copies. However, we also
// want to finally emit these instructions as an or at the end of the code
// generator to make the generated code easier to read. To do this, we select
// into "disjoint bits" pseudo ops.
// Treat an 'or' node as an 'add' node if the or'ed bits are known to be zero.
def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
KnownBits Known0 = CurDAG->computeKnownBits(N->getOperand(0), 0);
KnownBits Known1 = CurDAG->computeKnownBits(N->getOperand(1), 0);
return (~Known0.Zero & ~Known1.Zero) == 0;
}]>;
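// Illustrative sketch (editor's example): for
//   int f(int x) { return (x << 2) | 3; }
// the low two bits of (x << 2) are known zero, so the OR is really an ADD
// and the whole expression can fold into one `leal 3(,%rdi,4), %eax`.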
// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
// Try this before selecting to OR.
let SchedRW = [WriteALU] in {
let isConvertibleToThreeAddress = 1, isPseudo = 1,
Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
let isCommutable = 1 in {
def ADD8rr_DB : I<0, Pseudo, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
"", // orb/addb REG, REG
[(set GR8:$dst, (or_is_add GR8:$src1, GR8:$src2))]>;
def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"", // orw/addw REG, REG
[(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
"", // orl/addl REG, REG
[(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
"", // orq/addq REG, REG
[(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
} // isCommutable
// NOTE: These are order-specific; we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
def ADD8ri_DB : I<0, Pseudo,
(outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
"", // orb/addb REG, imm8
[(set GR8:$dst, (or_is_add GR8:$src1, imm:$src2))]>;
def ADD16ri8_DB : I<0, Pseudo,
(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"", // orw/addw REG, imm8
[(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
"", // orw/addw REG, imm
[(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
def ADD32ri8_DB : I<0, Pseudo,
(outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
"", // orl/addl REG, imm8
[(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
"", // orl/addl REG, imm
[(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
def ADD64ri8_DB : I<0, Pseudo,
(outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
"", // orq/addq REG, imm8
[(set GR64:$dst, (or_is_add GR64:$src1,
i64immSExt8:$src2))]>;
def ADD64ri32_DB : I<0, Pseudo,
(outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
"", // orq/addq REG, imm
[(set GR64:$dst, (or_is_add GR64:$src1,
i64immSExt32:$src2))]>;
}
} // SchedRW
//===----------------------------------------------------------------------===//
// Pattern match SUB as XOR
//===----------------------------------------------------------------------===//
// An immediate in the LHS of a subtract can't be encoded in the instruction.
// If there is no possibility of a borrow we can use an XOR instead of a SUB
// to enable the immediate to be folded.
// TODO: Move this to a DAG combine?
def sub_is_xor : PatFrag<(ops node:$lhs, node:$rhs), (sub node:$lhs, node:$rhs),[{
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
KnownBits Known = CurDAG->computeKnownBits(N->getOperand(1));
// If all possible ones in the RHS are set in the LHS then there can't be
// a borrow and we can use xor.
return (~Known.Zero).isSubsetOf(CN->getAPIntValue());
}
return false;
}]>;
let AddedComplexity = 5 in {
def : Pat<(sub_is_xor imm:$src2, GR8:$src1),
(XOR8ri GR8:$src1, imm:$src2)>;
def : Pat<(sub_is_xor i16immSExt8:$src2, GR16:$src1),
(XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(sub_is_xor imm:$src2, GR16:$src1),
(XOR16ri GR16:$src1, imm:$src2)>;
def : Pat<(sub_is_xor i32immSExt8:$src2, GR32:$src1),
(XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
def : Pat<(sub_is_xor imm:$src2, GR32:$src1),
(XOR32ri GR32:$src1, imm:$src2)>;
def : Pat<(sub_is_xor i64immSExt8:$src2, GR64:$src1),
(XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
def : Pat<(sub_is_xor i64immSExt32:$src2, GR64:$src1),
(XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
}
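// Illustrative sketch (editor's example): in
//   unsigned f(unsigned x) { return 63 - (x & 63); }
// every bit that can be set in (x & 63) is also set in 63, so no borrow can
// occur and the subtract becomes `xorl $63, %eax`, letting the immediate fold.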
//===----------------------------------------------------------------------===//
// Some peepholes
//===----------------------------------------------------------------------===//
// Odd encoding trick: -128 fits into an 8-bit immediate field while
// +128 doesn't, so in this special case use a sub instead of an add.
def : Pat<(add GR16:$src1, 128),
(SUB16ri8 GR16:$src1, -128)>;
def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
(SUB16mi8 addr:$dst, -128)>;
def : Pat<(add GR32:$src1, 128),
(SUB32ri8 GR32:$src1, -128)>;
def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
(SUB32mi8 addr:$dst, -128)>;
def : Pat<(add GR64:$src1, 128),
(SUB64ri8 GR64:$src1, -128)>;
def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
(SUB64mi8 addr:$dst, -128)>;
def : Pat<(X86add_flag_nocf GR16:$src1, 128),
(SUB16ri8 GR16:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR32:$src1, 128),
(SUB32ri8 GR32:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 128),
(SUB64ri8 GR64:$src1, -128)>;
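// Size sketch (editor's example): `addl $128, %eax` needs an imm32 encoding
// (05 80 00 00 00, 5 bytes) because +128 does not fit in a signed imm8,
// while the equivalent `subl $-128, %eax` is 83 E8 80 (3 bytes).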
// The same trick applies for 32-bit immediate fields in 64-bit
// instructions.
def : Pat<(add GR64:$src1, 0x0000000080000000),
(SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
(SUB64mi32 addr:$dst, 0xffffffff80000000)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000),
(SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
// To avoid needing to materialize an immediate in a register, use a 32-bit and
// with implicit zero-extension instead of a 64-bit and if the immediate has at
// least 32 bits of leading zeros. If in addition the last 32 bits can be
// represented with a sign extension of an 8-bit constant, use that.
// This can also reduce instruction size by eliminating the need for the REX
// prefix.
// AddedComplexity is needed to give priority over i64immSExt8 and i64immSExt32.
let AddedComplexity = 1 in {
def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
(SUBREG_TO_REG
(i64 0),
(AND32ri8
(EXTRACT_SUBREG GR64:$src, sub_32bit),
(i32 (GetLo32XForm imm:$imm))),
sub_32bit)>;
def : Pat<(and GR64:$src, i64immZExt32:$imm),
(SUBREG_TO_REG
(i64 0),
(AND32ri
(EXTRACT_SUBREG GR64:$src, sub_32bit),
(i32 (GetLo32XForm imm:$imm))),
sub_32bit)>;
} // AddedComplexity = 1
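// Illustrative sketch (editor's example): for
//   uint64_t f(uint64_t x) { return x & 0x0fffffff; }
// the mask has more than 32 leading zeros, so `andl $0x0fffffff, %edi`
// suffices: the 32-bit AND implicitly zeroes the upper half and avoids the
// REX.W prefix.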
// AddedComplexity is needed due to the increased complexity on the
// i64immZExt32SExt8 and i64immZExt32 patterns above. Applying this to all
// the MOVZX patterns keeps them together in DAGIsel tables.
let AddedComplexity = 1 in {
// r & (2^16-1) ==> movz
def : Pat<(and GR32:$src1, 0xffff),
(MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
// r & (2^8-1) ==> movz
def : Pat<(and GR32:$src1, 0xff),
(MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>;
// r & (2^8-1) ==> movz
def : Pat<(and GR16:$src1, 0xff),
(EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)),
sub_16bit)>;
// r & (2^32-1) ==> movz
def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
(SUBREG_TO_REG (i64 0),
(MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
sub_32bit)>;
// r & (2^16-1) ==> movz
def : Pat<(and GR64:$src, 0xffff),
(SUBREG_TO_REG (i64 0),
(MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
sub_32bit)>;
// r & (2^8-1) ==> movz
def : Pat<(and GR64:$src, 0xff),
(SUBREG_TO_REG (i64 0),
(MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
sub_32bit)>;
} // AddedComplexity = 1
// Try to use BTS/BTR/BTC for single bit operations on the upper 32-bits.
def BTRXForm : SDNodeXForm<imm, [{
// Transformation function: Find the lowest 0.
return getI64Imm((uint8_t)N->getAPIntValue().countTrailingOnes(), SDLoc(N));
}]>;
def BTCBTSXForm : SDNodeXForm<imm, [{
// Transformation function: Find the lowest 1.
return getI64Imm((uint8_t)N->getAPIntValue().countTrailingZeros(), SDLoc(N));
}]>;
def BTRMask64 : ImmLeaf<i64, [{
return !isUInt<32>(Imm) && !isInt<32>(Imm) && isPowerOf2_64(~Imm);
}]>;
def BTCBTSMask64 : ImmLeaf<i64, [{
return !isInt<32>(Imm) && isPowerOf2_64(Imm);
}]>;
// For now only do this for optsize.
let AddedComplexity = 1, Predicates=[OptForSize] in {
def : Pat<(and GR64:$src1, BTRMask64:$mask),
(BTR64ri8 GR64:$src1, (BTRXForm imm:$mask))>;
def : Pat<(or GR64:$src1, BTCBTSMask64:$mask),
(BTS64ri8 GR64:$src1, (BTCBTSXForm imm:$mask))>;
def : Pat<(xor GR64:$src1, BTCBTSMask64:$mask),
(BTC64ri8 GR64:$src1, (BTCBTSXForm imm:$mask))>;
}
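// Size sketch (editor's example): clearing bit 40 of a 64-bit value as
//   x &= ~(1ULL << 40);
// would otherwise need a `movabsq` of the mask (10 bytes) plus an `andq`,
// while `btrq $40, %rax` does it in 5 bytes (48 0F BA F0 28) at the cost of
// BT's extra latency, hence the OptForSize predicate.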
// sext_inreg patterns
def : Pat<(sext_inreg GR32:$src, i16),
(MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
def : Pat<(sext_inreg GR32:$src, i8),
(MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>;
def : Pat<(sext_inreg GR16:$src, i8),
(EXTRACT_SUBREG (MOVSX32rr8 (EXTRACT_SUBREG GR16:$src, sub_8bit)),
sub_16bit)>;
def : Pat<(sext_inreg GR64:$src, i32),
(MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
def : Pat<(sext_inreg GR64:$src, i16),
(MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
def : Pat<(sext_inreg GR64:$src, i8),
(MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
// sext, sext_load, zext, zext_load
def: Pat<(i16 (sext GR8:$src)),
(EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
def: Pat<(sextloadi16i8 addr:$src),
(EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
def: Pat<(i16 (zext GR8:$src)),
(EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
def: Pat<(zextloadi16i8 addr:$src),
(EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
// trunc patterns. Outside 64-bit mode only %eax/%ebx/%ecx/%edx have an
// addressable low-8 subregister, so i8 truncs must first be copied into
// the ABCD register classes.
def : Pat<(i16 (trunc GR32:$src)),
(EXTRACT_SUBREG GR32:$src, sub_16bit)>;
def : Pat<(i8 (trunc GR32:$src)),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
sub_8bit)>,
Requires<[Not64BitMode]>;
def : Pat<(i8 (trunc GR16:$src)),
(EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
sub_8bit)>,
Requires<[Not64BitMode]>;
def : Pat<(i32 (trunc GR64:$src)),
(EXTRACT_SUBREG GR64:$src, sub_32bit)>;
def : Pat<(i16 (trunc GR64:$src)),
(EXTRACT_SUBREG GR64:$src, sub_16bit)>;
def : Pat<(i8 (trunc GR64:$src)),
(EXTRACT_SUBREG GR64:$src, sub_8bit)>;
def : Pat<(i8 (trunc GR32:$src)),
(EXTRACT_SUBREG GR32:$src, sub_8bit)>,
Requires<[In64BitMode]>;
def : Pat<(i8 (trunc GR16:$src)),
(EXTRACT_SUBREG GR16:$src, sub_8bit)>,
Requires<[In64BitMode]>;
def immff00_ffff : ImmLeaf<i32, [{
return Imm >= 0xff00 && Imm <= 0xffff;
}]>;
// h-register tricks
def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
Requires<[Not64BitMode]>;
def : Pat<(i8 (trunc (srl_su (i32 (anyext GR16:$src)), (i8 8)))),
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
Requires<[Not64BitMode]>;
def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
(EXTRACT_SUBREG GR32:$src, sub_8bit_hi)>,
Requires<[Not64BitMode]>;
def : Pat<(srl GR16:$src, (i8 8)),
(EXTRACT_SUBREG
(MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_16bit)>;
def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
(MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
(MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
(MOVZX32rr8_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
def : Pat<(srl (and_su GR32:$src, immff00_ffff), (i8 8)),
(MOVZX32rr8_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
// h-register tricks.
// For now, be conservative on x86-64 and use an h-register extract only if the
// value is immediately zero-extended or stored, which are somewhat common
// cases. This uses a bunch of code to prevent a register requiring a REX prefix
// from being allocated in the same instruction as the h register, as there's
// currently no way to describe this requirement to the register allocator.
// h-register extract and zero-extend.
def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
(SUBREG_TO_REG
(i64 0),
(MOVZX32rr8_NOREX
(EXTRACT_SUBREG GR64:$src, sub_8bit_hi)),
sub_32bit)>;
def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
(SUBREG_TO_REG
(i64 0),
(MOVZX32rr8_NOREX
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_32bit)>;
def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
(SUBREG_TO_REG
(i64 0),
(MOVZX32rr8_NOREX
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_32bit)>;
// h-register extract and store.
def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
(MOV8mr_NOREX
addr:$dst,
(EXTRACT_SUBREG GR64:$src, sub_8bit_hi))>;
def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
(MOV8mr_NOREX
addr:$dst,
(EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>,
Requires<[In64BitMode]>;
def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
(MOV8mr_NOREX
addr:$dst,
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>,
Requires<[In64BitMode]>;
// (shl x, 1) ==> (add x, x)
// Note that if x is undef (immediate or otherwise), we could theoretically
// end up with the two uses of x getting different values, producing a result
// where the least significant bit is not 0. However, the probability of this
// happening is considered low enough that this is officially not a
// "real problem".
def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
def shiftMask8 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
return isUnneededShiftMask(N, 3);
}]>;
def shiftMask16 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
return isUnneededShiftMask(N, 4);
}]>;
def shiftMask32 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
return isUnneededShiftMask(N, 5);
}]>;
def shiftMask64 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
return isUnneededShiftMask(N, 6);
}]>;
// Shift amount is implicitly masked.
multiclass MaskedShiftAmountPats<SDNode frag, string name> {
// (shift x (and y, 31)) ==> (shift x, y)
def : Pat<(frag GR8:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "8rCL") GR8:$src1)>;
def : Pat<(frag GR16:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "16rCL") GR16:$src1)>;
def : Pat<(frag GR32:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "32rCL") GR32:$src1)>;
def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "8mCL") addr:$dst)>;
def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "16mCL") addr:$dst)>;
def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "32mCL") addr:$dst)>;
// (shift x (and y, 63)) ==> (shift x, y)
def : Pat<(frag GR64:$src1, (shiftMask64 CL)),
(!cast<Instruction>(name # "64rCL") GR64:$src1)>;
def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst),
(!cast<Instruction>(name # "64mCL") addr:$dst)>;
}
defm : MaskedShiftAmountPats<shl, "SHL">;
defm : MaskedShiftAmountPats<srl, "SHR">;
defm : MaskedShiftAmountPats<sra, "SAR">;
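// Illustrative sketch (editor's example): for
//   unsigned f(unsigned x, unsigned n) { return x << (n & 31); }
// the mask is redundant because the hardware already reduces the count in
// %cl modulo 32, so this selects to a bare `shll %cl, %eax`.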
// ROL/ROR instructions allow a stronger mask optimization than shift for 8- and
// 16-bit. We can remove a mask of any (bitwidth - 1) on the rotation amount
// because over-rotating produces the same result. This is noted in the Intel
// docs with: "tempCOUNT <- (COUNT & COUNTMASK) MOD SIZE". Masking the rotation
// amount could affect EFLAGS results, but that does not matter because we are
// not tracking flags for these nodes.
multiclass MaskedRotateAmountPats<SDNode frag, string name> {
// (rot x (and y, BitWidth - 1)) ==> (rot x, y)
def : Pat<(frag GR8:$src1, (shiftMask8 CL)),
(!cast<Instruction>(name # "8rCL") GR8:$src1)>;
def : Pat<(frag GR16:$src1, (shiftMask16 CL)),
(!cast<Instruction>(name # "16rCL") GR16:$src1)>;
def : Pat<(frag GR32:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "32rCL") GR32:$src1)>;
def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask8 CL)), addr:$dst),
(!cast<Instruction>(name # "8mCL") addr:$dst)>;
def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask16 CL)), addr:$dst),
(!cast<Instruction>(name # "16mCL") addr:$dst)>;
def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "32mCL") addr:$dst)>;
// (rot x (and y, 63)) ==> (rot x, y)
def : Pat<(frag GR64:$src1, (shiftMask64 CL)),
(!cast<Instruction>(name # "64rCL") GR64:$src1)>;
def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst),
(!cast<Instruction>(name # "64mCL") addr:$dst)>;
}
defm : MaskedRotateAmountPats<rotl, "ROL">;
defm : MaskedRotateAmountPats<rotr, "ROR">;
// Double shift amount is implicitly masked.
multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
// (shift x (and y, 31)) ==> (shift x, y)
def : Pat<(frag GR16:$src1, GR16:$src2, (shiftMask32 CL)),
(!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
def : Pat<(frag GR32:$src1, GR32:$src2, (shiftMask32 CL)),
(!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;
// (shift x (and y, 63)) ==> (shift x, y)
def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask64 CL)),
(!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
}
defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">;
defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
let Predicates = [HasBMI2] in {
let AddedComplexity = 1 in {
def : Pat<(sra GR32:$src1, (shiftMask32 GR8:$src2)),
(SARX32rr GR32:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(sra GR64:$src1, (shiftMask64 GR8:$src2)),
(SARX64rr GR64:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(srl GR32:$src1, (shiftMask32 GR8:$src2)),
(SHRX32rr GR32:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(srl GR64:$src1, (shiftMask64 GR8:$src2)),
(SHRX64rr GR64:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(shl GR32:$src1, (shiftMask32 GR8:$src2)),
(SHLX32rr GR32:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(shl GR64:$src1, (shiftMask64 GR8:$src2)),
(SHLX64rr GR64:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
def : Pat<(sra (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(SARX32rm addr:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(sra (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(SARX64rm addr:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(srl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(SHRX32rm addr:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(srl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(SHRX64rm addr:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(shl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(SHLX32rm addr:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(shl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(SHLX64rm addr:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
// Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location.
multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR,
Instruction BTS, Instruction BTC,
PatFrag ShiftMask> {
def : Pat<(and RC:$src1, (rotl -2, GR8:$src2)),
(BTR RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(or RC:$src1, (shl 1, GR8:$src2)),
(BTS RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(xor RC:$src1, (shl 1, GR8:$src2)),
(BTC RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
// Similar to above, but removing unneeded masking of the shift amount.
def : Pat<(and RC:$src1, (rotl -2, (ShiftMask GR8:$src2))),
(BTR RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(or RC:$src1, (shl 1, (ShiftMask GR8:$src2))),
(BTS RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(xor RC:$src1, (shl 1, (ShiftMask GR8:$src2))),
(BTC RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>;
defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>;
defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>;
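// Illustrative sketch (editor's example): for
//   unsigned clear_bit(unsigned x, unsigned n) { return x & ~(1u << n); }
// the mask ~(1 << n) is equivalent to (rotl -2, n), matching the first
// pattern above, so this can select to `btrl %esi, %edi` instead of
// materializing the mask with mov/shl/not/and.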
// (anyext (setcc_carry)) -> (setcc_carry)
def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C16r)>;
def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C32r)>;
def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C32r)>;
//===----------------------------------------------------------------------===//
// EFLAGS-defining Patterns
//===----------------------------------------------------------------------===//
// add reg, reg
def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>;
def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
def : Pat<(add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>;
// add reg, mem
def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
(ADD8rm GR8:$src1, addr:$src2)>;
def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
(ADD16rm GR16:$src1, addr:$src2)>;
def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
(ADD32rm GR32:$src1, addr:$src2)>;
def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
(ADD64rm GR64:$src1, addr:$src2)>;
// add reg, imm
def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>;
def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
def : Pat<(add GR16:$src1, i16immSExt8:$src2),
(ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(add GR32:$src1, i32immSExt8:$src2),
(ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
def : Pat<(add GR64:$src1, i64immSExt8:$src2),
(ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
def : Pat<(add GR64:$src1, i64immSExt32:$src2),
(ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
// sub reg, reg
def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>;
def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
def : Pat<(sub GR64:$src1, GR64:$src2), (SUB64rr GR64:$src1, GR64:$src2)>;
// sub reg, mem
def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
(SUB8rm GR8:$src1, addr:$src2)>;
def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
(SUB16rm GR16:$src1, addr:$src2)>;
def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
(SUB32rm GR32:$src1, addr:$src2)>;
def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
(SUB64rm GR64:$src1, addr:$src2)>;
// sub reg, imm
def : Pat<(sub GR8:$src1, imm:$src2),
(SUB8ri GR8:$src1, imm:$src2)>;
def : Pat<(sub GR16:$src1, imm:$src2),
(SUB16ri GR16:$src1, imm:$src2)>;
def : Pat<(sub GR32:$src1, imm:$src2),
(SUB32ri GR32:$src1, imm:$src2)>;
def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
(SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
(SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
(SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
(SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
// sub 0, reg
def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>;
def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
// sub reg, relocImm
def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2),
(SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
// mul reg, reg
def : Pat<(mul GR16:$src1, GR16:$src2),
(IMUL16rr GR16:$src1, GR16:$src2)>;
def : Pat<(mul GR32:$src1, GR32:$src2),
(IMUL32rr GR32:$src1, GR32:$src2)>;
def : Pat<(mul GR64:$src1, GR64:$src2),
(IMUL64rr GR64:$src1, GR64:$src2)>;
// mul reg, mem
def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
(IMUL16rm GR16:$src1, addr:$src2)>;
def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
(IMUL32rm GR32:$src1, addr:$src2)>;
def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
(IMUL64rm GR64:$src1, addr:$src2)>;
// mul reg, imm
def : Pat<(mul GR16:$src1, imm:$src2),
(IMUL16rri GR16:$src1, imm:$src2)>;
def : Pat<(mul GR32:$src1, imm:$src2),
(IMUL32rri GR32:$src1, imm:$src2)>;
def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
(IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
(IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
(IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
(IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
// reg = mul mem, imm
def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
(IMUL16rmi addr:$src1, imm:$src2)>;
def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
(IMUL32rmi addr:$src1, imm:$src2)>;
def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
(IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
(IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
(IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
(IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
// Increment/Decrement reg.
// Do not make INC/DEC if it is slow
let Predicates = [UseIncDec] in {
def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>;
def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>;
def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>;
def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;
def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>;
def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
def : Pat<(X86add_flag_nocf GR8:$src, -1), (DEC8r GR8:$src)>;
def : Pat<(X86add_flag_nocf GR16:$src, -1), (DEC16r GR16:$src)>;
def : Pat<(X86add_flag_nocf GR32:$src, -1), (DEC32r GR32:$src)>;
def : Pat<(X86add_flag_nocf GR64:$src, -1), (DEC64r GR64:$src)>;
def : Pat<(X86sub_flag_nocf GR8:$src, -1), (INC8r GR8:$src)>;
def : Pat<(X86sub_flag_nocf GR16:$src, -1), (INC16r GR16:$src)>;
def : Pat<(X86sub_flag_nocf GR32:$src, -1), (INC32r GR32:$src)>;
def : Pat<(X86sub_flag_nocf GR64:$src, -1), (INC64r GR64:$src)>;
}
// or reg/reg.
def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>;
def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
// or reg/mem
def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
(OR8rm GR8:$src1, addr:$src2)>;
def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
(OR16rm GR16:$src1, addr:$src2)>;
def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
(OR32rm GR32:$src1, addr:$src2)>;
def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
(OR64rm GR64:$src1, addr:$src2)>;
// or reg/imm
def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>;
def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
def : Pat<(or GR16:$src1, i16immSExt8:$src2),
(OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(or GR32:$src1, i32immSExt8:$src2),
(OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
def : Pat<(or GR64:$src1, i64immSExt8:$src2),
(OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
def : Pat<(or GR64:$src1, i64immSExt32:$src2),
(OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
// xor reg/reg
def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>;
def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
// xor reg/mem
def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
(XOR8rm GR8:$src1, addr:$src2)>;
def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
(XOR16rm GR16:$src1, addr:$src2)>;
def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
(XOR32rm GR32:$src1, addr:$src2)>;
def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
(XOR64rm GR64:$src1, addr:$src2)>;
// xor reg/imm
def : Pat<(xor GR8:$src1, imm:$src2),
(XOR8ri GR8:$src1, imm:$src2)>;
def : Pat<(xor GR16:$src1, imm:$src2),
(XOR16ri GR16:$src1, imm:$src2)>;
def : Pat<(xor GR32:$src1, imm:$src2),
(XOR32ri GR32:$src1, imm:$src2)>;
def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
(XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
(XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
(XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
(XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
// and reg/reg
def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>;
def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
// and reg/mem
def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
(AND8rm GR8:$src1, addr:$src2)>;
def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
(AND16rm GR16:$src1, addr:$src2)>;
def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
(AND32rm GR32:$src1, addr:$src2)>;
def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
(AND64rm GR64:$src1, addr:$src2)>;
// and reg/imm
def : Pat<(and GR8:$src1, imm:$src2),
(AND8ri GR8:$src1, imm:$src2)>;
def : Pat<(and GR16:$src1, imm:$src2),
(AND16ri GR16:$src1, imm:$src2)>;
def : Pat<(and GR32:$src1, imm:$src2),
(AND32ri GR32:$src1, imm:$src2)>;
def : Pat<(and GR16:$src1, i16immSExt8:$src2),
(AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
def : Pat<(and GR32:$src1, i32immSExt8:$src2),
(AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
def : Pat<(and GR64:$src1, i64immSExt8:$src2),
(AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
def : Pat<(and GR64:$src1, i64immSExt32:$src2),
(AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
// Bit scan instruction patterns to match explicit zero-undef behavior.
def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
// When HasMOVBE is enabled it is possible to get a non-legalized
// register-register 16-bit bswap. This maps it to a ROL instruction.
let Predicates = [HasMOVBE] in {
def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
}
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td (revision 362609)
@@ -1,425 +1,425 @@
//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 jump, return, call, and related instructions.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
// Return instructions.
//
// The X86retflag return instructions are variadic because we may add ST0 and
// ST1 arguments when returning values on the x87 stack.
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
"ret{l}", []>, OpSize32, Requires<[Not64BitMode]>;
def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
"ret{q}", []>, OpSize32, Requires<[In64BitMode]>;
def RETW : I <0xC3, RawFrm, (outs), (ins),
"ret{w}", []>, OpSize16;
def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret{l}\t$amt", []>, OpSize32, Requires<[Not64BitMode]>;
def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret{q}\t$amt", []>, OpSize32, Requires<[In64BitMode]>;
def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
"ret{w}\t$amt", []>, OpSize16;
def LRETL : I <0xCB, RawFrm, (outs), (ins),
"{l}ret{l|f}", []>, OpSize32;
def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
"{l}ret{|f}q", []>, Requires<[In64BitMode]>;
def LRETW : I <0xCB, RawFrm, (outs), (ins),
"{l}ret{w|f}", []>, OpSize16;
def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
"{l}ret{l|f}\t$amt", []>, OpSize32;
def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
"{l}ret{|f}q\t$amt", []>, Requires<[In64BitMode]>;
def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
"{l}ret{w|f}\t$amt", []>, OpSize16;
// The machine return-from-interrupt instructions. Sometimes we need to
// perform a post-epilogue stack adjustment, so codegen emits the pseudo form,
// which expands to include an SP adjustment if necessary.
def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", []>,
OpSize16;
def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>, OpSize32;
def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", []>, Requires<[In64BitMode]>;
let isCodeGenOnly = 1 in
def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>;
def RET : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>;
}
// Unconditional branches.
let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
"jmp\t$dst", [(br bb:$dst)]>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
"jmp\t$dst", []>, OpSize16;
def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
"jmp\t$dst", []>, OpSize32;
}
}
// Conditional Branches.
let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump],
isCodeGenOnly = 1, ForceDisassemble = 1 in {
def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs),
(ins brtarget8:$dst, ccode:$cond),
"j${cond}\t$dst",
[(X86brcond bb:$dst, timm:$cond, EFLAGS)]>;
let hasSideEffects = 0 in {
def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs),
(ins brtarget16:$dst, ccode:$cond),
"j${cond}\t$dst",
[]>, OpSize16, TB;
def JCC_4 : Ii32PCRel<0x80, AddCCFrm, (outs),
(ins brtarget32:$dst, ccode:$cond),
"j${cond}\t$dst",
[]>, TB, OpSize32;
}
}
def : InstAlias<"jo\t$dst", (JCC_1 brtarget8:$dst, 0), 0>;
def : InstAlias<"jno\t$dst", (JCC_1 brtarget8:$dst, 1), 0>;
def : InstAlias<"jb\t$dst", (JCC_1 brtarget8:$dst, 2), 0>;
def : InstAlias<"jae\t$dst", (JCC_1 brtarget8:$dst, 3), 0>;
def : InstAlias<"je\t$dst", (JCC_1 brtarget8:$dst, 4), 0>;
def : InstAlias<"jne\t$dst", (JCC_1 brtarget8:$dst, 5), 0>;
def : InstAlias<"jbe\t$dst", (JCC_1 brtarget8:$dst, 6), 0>;
def : InstAlias<"ja\t$dst", (JCC_1 brtarget8:$dst, 7), 0>;
def : InstAlias<"js\t$dst", (JCC_1 brtarget8:$dst, 8), 0>;
def : InstAlias<"jns\t$dst", (JCC_1 brtarget8:$dst, 9), 0>;
def : InstAlias<"jp\t$dst", (JCC_1 brtarget8:$dst, 10), 0>;
def : InstAlias<"jnp\t$dst", (JCC_1 brtarget8:$dst, 11), 0>;
def : InstAlias<"jl\t$dst", (JCC_1 brtarget8:$dst, 12), 0>;
def : InstAlias<"jge\t$dst", (JCC_1 brtarget8:$dst, 13), 0>;
def : InstAlias<"jle\t$dst", (JCC_1 brtarget8:$dst, 14), 0>;
def : InstAlias<"jg\t$dst", (JCC_1 brtarget8:$dst, 15), 0>;
// jcx/jecx/jrcx instructions.
let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
// These are the 32-bit versions of this instruction for the asmparser. In
// 32-bit mode, the address-size-prefixed form is jcxz and the unprefixed
// form is jecxz.
let Uses = [CX] in
def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
"jcxz\t$dst", []>, AdSize16, Requires<[Not64BitMode]>;
let Uses = [ECX] in
def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
"jecxz\t$dst", []>, AdSize32;
let Uses = [RCX] in
def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
"jrcxz\t$dst", []>, AdSize64, Requires<[In64BitMode]>;
}
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
[(brind GR16:$dst)]>, Requires<[Not64BitMode]>,
OpSize16, Sched<[WriteJump]>;
def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
[(brind (loadi16 addr:$dst))]>, Requires<[Not64BitMode]>,
OpSize16, Sched<[WriteJumpLd]>;
def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
[(brind GR32:$dst)]>, Requires<[Not64BitMode]>,
OpSize32, Sched<[WriteJump]>;
def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
[(brind (loadi32 addr:$dst))]>, Requires<[Not64BitMode]>,
OpSize32, Sched<[WriteJumpLd]>;
def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
[(brind GR64:$dst)]>, Requires<[In64BitMode]>,
Sched<[WriteJump]>;
def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
[(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>,
Sched<[WriteJumpLd]>;
// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
// These are switched from TAILJMPr/m64_REX in MCInstLower.
let isCodeGenOnly = 1, hasREX_WPrefix = 1 in {
def JMP64r_REX : I<0xFF, MRM4r, (outs), (ins GR64:$dst),
"rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJump]>;
let mayLoad = 1 in
def JMP64m_REX : I<0xFF, MRM4m, (outs), (ins i64mem:$dst),
"rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJumpLd]>;
}
// Non-tracking jumps for IBT, use with caution.
let isCodeGenOnly = 1 in {
def JMP16r_NT : I<0xFF, MRM4r, (outs), (ins GR16 : $dst), "jmp{w}\t{*}$dst",
[(X86NoTrackBrind GR16 : $dst)]>, Requires<[Not64BitMode]>,
OpSize16, Sched<[WriteJump]>, NOTRACK;
def JMP16m_NT : I<0xFF, MRM4m, (outs), (ins i16mem : $dst), "jmp{w}\t{*}$dst",
[(X86NoTrackBrind (loadi16 addr : $dst))]>,
Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>,
NOTRACK;
def JMP32r_NT : I<0xFF, MRM4r, (outs), (ins GR32 : $dst), "jmp{l}\t{*}$dst",
[(X86NoTrackBrind GR32 : $dst)]>, Requires<[Not64BitMode]>,
OpSize32, Sched<[WriteJump]>, NOTRACK;
def JMP32m_NT : I<0xFF, MRM4m, (outs), (ins i32mem : $dst), "jmp{l}\t{*}$dst",
[(X86NoTrackBrind (loadi32 addr : $dst))]>,
Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>,
NOTRACK;
def JMP64r_NT : I<0xFF, MRM4r, (outs), (ins GR64 : $dst), "jmp{q}\t{*}$dst",
[(X86NoTrackBrind GR64 : $dst)]>, Requires<[In64BitMode]>,
Sched<[WriteJump]>, NOTRACK;
def JMP64m_NT : I<0xFF, MRM4m, (outs), (ins i64mem : $dst), "jmp{q}\t{*}$dst",
[(X86NoTrackBrind(loadi64 addr : $dst))]>,
Requires<[In64BitMode]>, Sched<[WriteJumpLd]>, NOTRACK;
}
let Predicates = [Not64BitMode], AsmVariantName = "att" in {
def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
"ljmp{w}\t$seg, $off", []>,
OpSize16, Sched<[WriteJump]>;
def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
"ljmp{l}\t$seg, $off", []>,
OpSize32, Sched<[WriteJump]>;
}
def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
"ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>;
let AsmVariantName = "att" in
def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
"ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
"{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
}
// Loop instructions
let SchedRW = [WriteJump] in {
def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>;
def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>;
def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>;
}
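// (Semantics recap: LOOP decrements rCX and branches to $dst while rCX != 0;
// LOOPE/LOOPNE additionally require ZF = 1 / ZF = 0. The rCX update is not
// modeled here, with no patterns or Defs, so these exist for the assembler
// and disassembler rather than for instruction selection.)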
//===----------------------------------------------------------------------===//
// Call Instructions...
//
let isCall = 1 in
// All calls clobber the non-callee-saved registers. ESP is marked as a use
// so that stack-pointer assignments appearing immediately before a call are
// not mistakenly considered dead. Uses for argument registers are added
// manually.
let Uses = [ESP, SSP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i32imm_brtarget:$dst),
"call{l}\t$dst", []>, OpSize32,
Requires<[Not64BitMode]>, Sched<[WriteJump]>;
let hasSideEffects = 0 in
def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
(outs), (ins i16imm_brtarget:$dst),
"call{w}\t$dst", []>, OpSize16,
Sched<[WriteJump]>;
def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
"call{w}\t{*}$dst", [(X86call GR16:$dst)]>,
OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
"call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))]>,
OpSize16, Requires<[Not64BitMode,FavorMemIndirectCall]>,
Sched<[WriteJumpLd]>;
def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
"call{l}\t{*}$dst", [(X86call GR32:$dst)]>, OpSize32,
- Requires<[Not64BitMode,NotUseRetpolineIndirectCalls]>,
+ Requires<[Not64BitMode,NotUseIndirectThunkCalls]>,
Sched<[WriteJump]>;
def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
"call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>,
OpSize32,
Requires<[Not64BitMode,FavorMemIndirectCall,
- NotUseRetpolineIndirectCalls]>,
+ NotUseIndirectThunkCalls]>,
Sched<[WriteJumpLd]>;
// Non-tracking calls for IBT; use with caution.
let isCodeGenOnly = 1 in {
def CALL16r_NT : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
"call{w}\t{*}$dst", [(X86NoTrackCall GR16:$dst)]>,
OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>, NOTRACK;
def CALL16m_NT : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
"call{w}\t{*}$dst", [(X86NoTrackCall (loadi16 addr:$dst))]>,
OpSize16, Requires<[Not64BitMode,FavorMemIndirectCall]>,
Sched<[WriteJumpLd]>, NOTRACK;
def CALL32r_NT : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
"call{l}\t{*}$dst", [(X86NoTrackCall GR32:$dst)]>,
OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>, NOTRACK;
def CALL32m_NT : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
"call{l}\t{*}$dst", [(X86NoTrackCall (loadi32 addr:$dst))]>,
OpSize32, Requires<[Not64BitMode,FavorMemIndirectCall]>,
Sched<[WriteJumpLd]>, NOTRACK;
}
let Predicates = [Not64BitMode], AsmVariantName = "att" in {
def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
"lcall{w}\t$seg, $off", []>,
OpSize16, Sched<[WriteJump]>;
def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
"lcall{l}\t$seg, $off", []>,
OpSize32, Sched<[WriteJump]>;
}
def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
"lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
"{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
}
// Tail-call pseudo-instructions.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, Uses = [ESP, SSP] in {
def TCRETURNdi : PseudoI<(outs), (ins i32imm_brtarget:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>, NotMemoryFoldable;
def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>, NotMemoryFoldable;
let mayLoad = 1 in
def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset),
[]>, Sched<[WriteJumpLd]>;
def TAILJMPd : PseudoI<(outs), (ins i32imm_brtarget:$dst),
[]>, Sched<[WriteJump]>;
def TAILJMPr : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
[]>, Sched<[WriteJump]>;
let mayLoad = 1 in
def TAILJMPm : PseudoI<(outs), (ins i32mem_TC:$dst),
[]>, Sched<[WriteJumpLd]>;
}
// Conditional tail calls are similar to the above, but they are branches
// rather than barriers, and they use EFLAGS.
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
isCodeGenOnly = 1, SchedRW = [WriteJump] in
let Uses = [ESP, EFLAGS, SSP] in {
def TCRETURNdicc : PseudoI<(outs),
(ins i32imm_brtarget:$dst, i32imm:$offset, i32imm:$cond),
[]>;
// This gets substituted to a conditional jump instruction in MC lowering.
def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_brtarget:$dst, i32imm:$cond), []>;
}
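// (Illustrative lowering: a TAILJMPd_CC whose $cond encodes COND_E is emitted
// as a plain "je <target>"; the actual mapping is done during MC lowering, as
// noted above.)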
//===----------------------------------------------------------------------===//
// Call Instructions (64-bit)...
//
// RSP is marked as a use so that stack-pointer assignments appearing
// immediately before a call are not mistakenly considered dead. Uses for
// argument registers are added manually.
let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
// NOTE: this pattern doesn't match "X86call imm", because we do not know
// that the offset between an arbitrary immediate and the call will fit in
// the 32-bit pcrel field that we have.
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i64i32imm_brtarget:$dst),
"call{q}\t$dst", []>, OpSize32,
Requires<[In64BitMode]>;
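// (For example, under the large code model the callee may be farther than
// 2GB away, so instead of this rel32 form the address is materialized into a
// register and CALL64r is used.)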
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
- Requires<[In64BitMode,NotUseRetpolineIndirectCalls]>;
+ Requires<[In64BitMode,NotUseIndirectThunkCalls]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
Requires<[In64BitMode,FavorMemIndirectCall,
- NotUseRetpolineIndirectCalls]>;
+ NotUseIndirectThunkCalls]>;
// Non-tracking calls for IBT; use with caution.
let isCodeGenOnly = 1 in {
def CALL64r_NT : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86NoTrackCall GR64:$dst)]>,
Requires<[In64BitMode]>, NOTRACK;
def CALL64m_NT : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst",
[(X86NoTrackCall (loadi64 addr:$dst))]>,
Requires<[In64BitMode,FavorMemIndirectCall]>, NOTRACK;
}
def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
"lcall{q}\t{*}$dst", []>;
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, Uses = [RSP, SSP] in {
def TCRETURNdi64 : PseudoI<(outs),
(ins i64i32imm_brtarget:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
def TCRETURNri64 : PseudoI<(outs),
(ins ptr_rc_tailcall:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>, NotMemoryFoldable;
let mayLoad = 1 in
def TCRETURNmi64 : PseudoI<(outs),
(ins i64mem_TC:$dst, i32imm:$offset),
[]>, Sched<[WriteJumpLd]>, NotMemoryFoldable;
def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_brtarget:$dst),
[]>, Sched<[WriteJump]>;
def TAILJMPr64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
[]>, Sched<[WriteJump]>;
let mayLoad = 1 in
def TAILJMPm64 : PseudoI<(outs), (ins i64mem_TC:$dst),
[]>, Sched<[WriteJumpLd]>;
// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
let hasREX_WPrefix = 1 in {
def TAILJMPr64_REX : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
[]>, Sched<[WriteJump]>;
let mayLoad = 1 in
def TAILJMPm64_REX : PseudoI<(outs), (ins i64mem_TC:$dst),
[]>, Sched<[WriteJumpLd]>;
}
}
let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
Uses = [RSP, SSP],
usesCustomInserter = 1,
SchedRW = [WriteJump] in {
- def RETPOLINE_CALL32 :
+ def INDIRECT_THUNK_CALL32 :
PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>,
- Requires<[Not64BitMode,UseRetpolineIndirectCalls]>;
+ Requires<[Not64BitMode,UseIndirectThunkCalls]>;
- def RETPOLINE_CALL64 :
+ def INDIRECT_THUNK_CALL64 :
PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>,
- Requires<[In64BitMode,UseRetpolineIndirectCalls]>;
+ Requires<[In64BitMode,UseIndirectThunkCalls]>;
- // Retpoline variant of indirect tail calls.
+ // Indirect thunk variant of indirect tail calls.
let isTerminator = 1, isReturn = 1, isBarrier = 1 in {
- def RETPOLINE_TCRETURN64 :
+ def INDIRECT_THUNK_TCRETURN64 :
PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>;
- def RETPOLINE_TCRETURN32 :
+ def INDIRECT_THUNK_TCRETURN32 :
PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>;
}
}
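// Expansion sketch (thunk names are LLVM's defaults and may differ per
// mitigation): INDIRECT_THUNK_CALL64 with the target in %rax becomes
//   movq %rax, %r11
//   callq __llvm_retpoline_r11   # or __llvm_lvi_thunk_r11
// so the only indirect branch actually executed is the one inside the thunk.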
// Conditional tail calls are similar to the above, but they are branches
// rather than barriers, and they use EFLAGS.
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
isCodeGenOnly = 1, SchedRW = [WriteJump] in
let Uses = [RSP, EFLAGS, SSP] in {
def TCRETURNdi64cc : PseudoI<(outs),
(ins i64i32imm_brtarget:$dst, i32imm:$offset,
i32imm:$cond), []>;
// This gets substituted to a conditional jump instruction in MC lowering.
def TAILJMPd64_CC : PseudoI<(outs),
(ins i64i32imm_brtarget:$dst, i32imm:$cond), []>;
}
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td (revision 362609)
@@ -1,3583 +1,3583 @@
//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 instruction set, defining the instructions and
// the properties of those instructions that are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// X86 specific DAG Nodes.
//
def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
def SDTX86Cmov : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
// Unary and binary operator instructions that set EFLAGS as a side-effect.
def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
[SDTCisSameAs<0, 2>,
SDTCisInt<0>, SDTCisVT<1, i32>]>;
def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>, SDTCisVT<1, i32>]>;
// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS
def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>,
SDTCisVT<1, i32>,
SDTCisVT<4, i32>]>;
// RES1, RES2, FLAGS = op LHS, RHS
def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>, SDTCisVT<1, i32>]>;
def SDTX86BrCond : SDTypeProfile<0, 3,
[SDTCisVT<0, OtherVT>,
SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
def SDTX86SetCC : SDTypeProfile<1, 2,
[SDTCisVT<0, i8>,
SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
def SDTX86SetCC_C : SDTypeProfile<1, 2,
[SDTCisInt<0>,
SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
def SDTX86rdpkru : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDTX86wrpkru : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>]>;
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>;
def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3,
[SDTCisVT<0, i32>, SDTCisPtrTy<1>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3,
[SDTCisVT<0, i64>, SDTCisPtrTy<1>,
SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisPtrTy<1>,
SDTCisInt<2>]>;
def SDTLockUnaryArithWithFlags : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
SDTCisPtrTy<1>]>;
def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_X86NtBrind : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
SDTCisVT<1, iPTR>,
SDTCisVT<2, iPTR>]>;
def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
SDTCisVT<2, i32>,
SDTCisVT<3, i8>,
SDTCisVT<4, i32>]>;
def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
def SDTX86Void : SDTypeProfile<0, 0, []>;
def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
def SDT_X86ENQCMD : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisPtrTy<1>, SDTCisSameAs<1, 2>]>;
def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
[SDNPHasChain,SDNPSideEffect]>;
def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
[SDNPHasChain]>;
def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
def X86strict_fcmp : SDNode<"X86ISD::STRICT_FCMP", SDTX86CmpTest, [SDNPHasChain]>;
def X86strict_fcmps : SDNode<"X86ISD::STRICT_FCMPS", SDTX86CmpTest, [SDNPHasChain]>;
def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
[SDNPHasChain]>;
def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>;
def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
[SDNPHasChain, SDNPSideEffect]>;
def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand,
[SDNPHasChain, SDNPSideEffect]>;
def X86rdpkru : SDNode<"X86ISD::RDPKRU", SDTX86rdpkru,
[SDNPHasChain, SDNPSideEffect]>;
def X86wrpkru : SDNode<"X86ISD::WRPKRU", SDTX86wrpkru,
[SDNPHasChain, SDNPSideEffect]>;
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG",
SDTX86caspairSaveEbx8,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG",
SDTX86caspairSaveRbx16,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret,
[SDNPHasChain, SDNPOptInGlue]>;
def X86vastart_save_xmm_regs :
SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
SDT_X86VASTART_SAVE_XMM_REGS,
[SDNPHasChain, SDNPVariadic]>;
def X86vaarg64 :
SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore,
SDNPMemOperand]>;
def X86callseq_start :
SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
def X86callseq_end :
SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
def X86NoTrackBrind : SDNode<"X86ISD::NT_BRIND", SDT_X86NtBrind,
[SDNPHasChain]>;
def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad]>;
def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER",
SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisInt<1>]>>;
def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
[SDNPHasChain]>;
def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP",
SDTypeProfile<1, 1, [SDTCisInt<0>,
SDTCisPtrTy<1>]>,
[SDNPHasChain, SDNPSideEffect]>;
def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPSideEffect]>;
def X86eh_sjlj_setup_dispatch : SDNode<"X86ISD::EH_SJLJ_SETUP_DISPATCH",
SDTypeProfile<0, 0, []>,
[SDNPHasChain, SDNPSideEffect]>;
def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>;
def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags,
[SDNPCommutative]>;
def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>;
def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>;
def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def X86lock_add : SDNode<"X86ISD::LADD", SDTLockBinaryArithWithFlags,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
def X86lock_sub : SDNode<"X86ISD::LSUB", SDTLockBinaryArithWithFlags,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
def X86lock_or : SDNode<"X86ISD::LOR", SDTLockBinaryArithWithFlags,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
def X86lock_xor : SDNode<"X86ISD::LXOR", SDTLockBinaryArithWithFlags,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>;
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
[SDNPHasChain, SDNPOutGlue]>;
def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
[SDNPHasChain]>;
def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def X86lwpins : SDNode<"X86ISD::LWPINS",
SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPSideEffect]>;
def X86umwait : SDNode<"X86ISD::UMWAIT",
SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
[SDNPHasChain, SDNPSideEffect]>;
def X86tpause : SDNode<"X86ISD::TPAUSE",
SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
[SDNPHasChain, SDNPSideEffect]>;
def X86enqcmd : SDNode<"X86ISD::ENQCMD", SDT_X86ENQCMD,
[SDNPHasChain, SDNPSideEffect]>;
def X86enqcmds : SDNode<"X86ISD::ENQCMDS", SDT_X86ENQCMD,
[SDNPHasChain, SDNPSideEffect]>;
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
//
// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
// the index operand of an address, to conform to x86 encoding restrictions.
def ptr_rc_nosp : PointerLikeRegClass<1>;
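// (This falls out of the encoding: the SIB byte reserves index = 0b100 to
// mean "no index", so ESP/RSP can never be encoded as an index register.)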
// *mem - Operand definitions for the funky X86 addressing mode operands.
//
def X86MemAsmOperand : AsmOperandClass {
let Name = "Mem";
}
let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in {
def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; }
def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; }
def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; }
def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; }
def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; }
def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; }
def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; }
def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; }
// Gather mem operands
def X86Mem64_RC128Operand : AsmOperandClass { let Name = "Mem64_RC128"; }
def X86Mem128_RC128Operand : AsmOperandClass { let Name = "Mem128_RC128"; }
def X86Mem256_RC128Operand : AsmOperandClass { let Name = "Mem256_RC128"; }
def X86Mem128_RC256Operand : AsmOperandClass { let Name = "Mem128_RC256"; }
def X86Mem256_RC256Operand : AsmOperandClass { let Name = "Mem256_RC256"; }
def X86Mem64_RC128XOperand : AsmOperandClass { let Name = "Mem64_RC128X"; }
def X86Mem128_RC128XOperand : AsmOperandClass { let Name = "Mem128_RC128X"; }
def X86Mem256_RC128XOperand : AsmOperandClass { let Name = "Mem256_RC128X"; }
def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; }
def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; }
def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; }
def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; }
def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; }
}
def X86AbsMemAsmOperand : AsmOperandClass {
let Name = "AbsMem";
let SuperClasses = [X86MemAsmOperand];
}
class X86MemOperand<string printMethod,
AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> {
let PrintMethod = printMethod;
let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
let ParserMatchClass = parserMatchClass;
let OperandType = "OPERAND_MEMORY";
}
// Gather mem operands
class X86VMemOperand<RegisterClass RC, string printMethod,
AsmOperandClass parserMatchClass>
: X86MemOperand<printMethod, parserMatchClass> {
let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG);
}
def anymem : X86MemOperand<"printanymem">;
def X86any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
[(X86strict_fcmp node:$lhs, node:$rhs),
(X86cmp node:$lhs, node:$rhs)]>;
// FIXME: Right now we allow any size during parsing, but we might want to
// restrict to only unsized memory.
def opaquemem : X86MemOperand<"printopaquemem">;
def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>;
def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>;
def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>;
def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
// Gather mem operands
def vx64mem : X86VMemOperand<VR128, "printqwordmem", X86Mem64_RC128Operand>;
def vx128mem : X86VMemOperand<VR128, "printxmmwordmem", X86Mem128_RC128Operand>;
def vx256mem : X86VMemOperand<VR128, "printymmwordmem", X86Mem256_RC128Operand>;
def vy128mem : X86VMemOperand<VR256, "printxmmwordmem", X86Mem128_RC256Operand>;
def vy256mem : X86VMemOperand<VR256, "printymmwordmem", X86Mem256_RC256Operand>;
def vx64xmem : X86VMemOperand<VR128X, "printqwordmem", X86Mem64_RC128XOperand>;
def vx128xmem : X86VMemOperand<VR128X, "printxmmwordmem", X86Mem128_RC128XOperand>;
def vx256xmem : X86VMemOperand<VR128X, "printymmwordmem", X86Mem256_RC128XOperand>;
def vy128xmem : X86VMemOperand<VR256X, "printxmmwordmem", X86Mem128_RC256XOperand>;
def vy256xmem : X86VMemOperand<VR256X, "printymmwordmem", X86Mem256_RC256XOperand>;
def vy512xmem : X86VMemOperand<VR256X, "printzmmwordmem", X86Mem512_RC256XOperand>;
def vz256mem : X86VMemOperand<VR512, "printymmwordmem", X86Mem256_RC512Operand>;
def vz512mem : X86VMemOperand<VR512, "printzmmwordmem", X86Mem512_RC512Operand>;
// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
// of a plain GPR, so that it doesn't potentially require a REX prefix.
def ptr_rc_norex : PointerLikeRegClass<2>;
def ptr_rc_norex_nosp : PointerLikeRegClass<3>;
def i8mem_NOREX : Operand<iPTR> {
let PrintMethod = "printbytemem";
let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm,
SEGMENT_REG);
let ParserMatchClass = X86Mem8AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
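// (Rationale: instructions that reference the legacy high-byte registers
// AH/BH/CH/DH cannot carry a REX prefix, so a memory operand appearing in the
// same instruction must avoid registers that are only reachable via REX.)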
// GPRs available for tail calls; resolves to GR32_TC, GR64_TC, or GR64_TCW64.
def ptr_rc_tailcall : PointerLikeRegClass<4>;
// Special i32mem for addresses of load-folding tail calls. These are not
// allowed to use callee-saved registers since they must be scheduled
// after callee-saved registers are popped.
def i32mem_TC : Operand<i32> {
let PrintMethod = "printdwordmem";
let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
i32imm, SEGMENT_REG);
let ParserMatchClass = X86Mem32AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
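// (Concretely: if the folded address lived in a callee-saved register such as
// EBX, the epilogue would restore the caller's EBX before the tail-call jump
// executes, and the jump would load its target through a stale pointer.)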
// Special i64mem for addresses of load-folding tail calls. These are not
// allowed to use callee-saved registers since they must be scheduled
// after callee-saved registers are popped.
def i64mem_TC : Operand<i64> {
let PrintMethod = "printqwordmem";
let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
ptr_rc_tailcall, i32imm, SEGMENT_REG);
let ParserMatchClass = X86Mem64AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
// Special parser that detects 16-bit mode so a 16-bit displacement can be selected.
def X86AbsMem16AsmOperand : AsmOperandClass {
let Name = "AbsMem16";
let RenderMethod = "addAbsMemOperands";
let SuperClasses = [X86AbsMemAsmOperand];
}
// Branch targets print as pc-relative values.
class BranchTargetOperand<ValueType ty> : Operand<ty> {
let OperandType = "OPERAND_PCREL";
let PrintMethod = "printPCRelImm";
let ParserMatchClass = X86AbsMemAsmOperand;
}
def i32imm_brtarget : BranchTargetOperand<i32>;
def i16imm_brtarget : BranchTargetOperand<i16>;
// 64 bits, but only 32 bits are significant, and those bits are treated as
// being pc-relative.
def i64i32imm_brtarget : BranchTargetOperand<i64>;
def brtarget : BranchTargetOperand<OtherVT>;
def brtarget8 : BranchTargetOperand<OtherVT>;
def brtarget16 : BranchTargetOperand<OtherVT> {
let ParserMatchClass = X86AbsMem16AsmOperand;
}
def brtarget32 : BranchTargetOperand<OtherVT>;
let RenderMethod = "addSrcIdxOperands" in {
def X86SrcIdx8Operand : AsmOperandClass {
let Name = "SrcIdx8";
let SuperClasses = [X86Mem8AsmOperand];
}
def X86SrcIdx16Operand : AsmOperandClass {
let Name = "SrcIdx16";
let SuperClasses = [X86Mem16AsmOperand];
}
def X86SrcIdx32Operand : AsmOperandClass {
let Name = "SrcIdx32";
let SuperClasses = [X86Mem32AsmOperand];
}
def X86SrcIdx64Operand : AsmOperandClass {
let Name = "SrcIdx64";
let SuperClasses = [X86Mem64AsmOperand];
}
} // RenderMethod = "addSrcIdxOperands"
let RenderMethod = "addDstIdxOperands" in {
def X86DstIdx8Operand : AsmOperandClass {
let Name = "DstIdx8";
let SuperClasses = [X86Mem8AsmOperand];
}
def X86DstIdx16Operand : AsmOperandClass {
let Name = "DstIdx16";
let SuperClasses = [X86Mem16AsmOperand];
}
def X86DstIdx32Operand : AsmOperandClass {
let Name = "DstIdx32";
let SuperClasses = [X86Mem32AsmOperand];
}
def X86DstIdx64Operand : AsmOperandClass {
let Name = "DstIdx64";
let SuperClasses = [X86Mem64AsmOperand];
}
} // RenderMethod = "addDstIdxOperands"
let RenderMethod = "addMemOffsOperands" in {
def X86MemOffs16_8AsmOperand : AsmOperandClass {
let Name = "MemOffs16_8";
let SuperClasses = [X86Mem8AsmOperand];
}
def X86MemOffs16_16AsmOperand : AsmOperandClass {
let Name = "MemOffs16_16";
let SuperClasses = [X86Mem16AsmOperand];
}
def X86MemOffs16_32AsmOperand : AsmOperandClass {
let Name = "MemOffs16_32";
let SuperClasses = [X86Mem32AsmOperand];
}
def X86MemOffs32_8AsmOperand : AsmOperandClass {
let Name = "MemOffs32_8";
let SuperClasses = [X86Mem8AsmOperand];
}
def X86MemOffs32_16AsmOperand : AsmOperandClass {
let Name = "MemOffs32_16";
let SuperClasses = [X86Mem16AsmOperand];
}
def X86MemOffs32_32AsmOperand : AsmOperandClass {
let Name = "MemOffs32_32";
let SuperClasses = [X86Mem32AsmOperand];
}
def X86MemOffs32_64AsmOperand : AsmOperandClass {
let Name = "MemOffs32_64";
let SuperClasses = [X86Mem64AsmOperand];
}
def X86MemOffs64_8AsmOperand : AsmOperandClass {
let Name = "MemOffs64_8";
let SuperClasses = [X86Mem8AsmOperand];
}
def X86MemOffs64_16AsmOperand : AsmOperandClass {
let Name = "MemOffs64_16";
let SuperClasses = [X86Mem16AsmOperand];
}
def X86MemOffs64_32AsmOperand : AsmOperandClass {
let Name = "MemOffs64_32";
let SuperClasses = [X86Mem32AsmOperand];
}
def X86MemOffs64_64AsmOperand : AsmOperandClass {
let Name = "MemOffs64_64";
let SuperClasses = [X86Mem64AsmOperand];
}
} // RenderMethod = "addMemOffsOperands"
class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
: X86MemOperand<printMethod, parserMatchClass> {
let MIOperandInfo = (ops ptr_rc, SEGMENT_REG);
}
class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
: X86MemOperand<printMethod, parserMatchClass> {
let MIOperandInfo = (ops ptr_rc);
}
def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>;
def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>;
def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>;
def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>;
def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>;
def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>;
def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>;
def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>;
class X86MemOffsOperand<Operand immOperand, string printMethod,
AsmOperandClass parserMatchClass>
: X86MemOperand<printMethod, parserMatchClass> {
let MIOperandInfo = (ops immOperand, SEGMENT_REG);
}
def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8",
X86MemOffs16_8AsmOperand>;
def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16",
X86MemOffs16_16AsmOperand>;
def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32",
X86MemOffs16_32AsmOperand>;
def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8",
X86MemOffs32_8AsmOperand>;
def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16",
X86MemOffs32_16AsmOperand>;
def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32",
X86MemOffs32_32AsmOperand>;
def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64",
X86MemOffs32_64AsmOperand>;
def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8",
X86MemOffs64_8AsmOperand>;
def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16",
X86MemOffs64_16AsmOperand>;
def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32",
X86MemOffs64_32AsmOperand>;
def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64",
X86MemOffs64_64AsmOperand>;
def ccode : Operand<i8> {
let PrintMethod = "printCondCode";
let OperandNamespace = "X86";
let OperandType = "OPERAND_COND_CODE";
}
class ImmSExtAsmOperandClass : AsmOperandClass {
let SuperClasses = [ImmAsmOperand];
let RenderMethod = "addImmOperands";
}
def X86GR32orGR64AsmOperand : AsmOperandClass {
let Name = "GR32orGR64";
}
def GR32orGR64 : RegisterOperand<GR32> {
let ParserMatchClass = X86GR32orGR64AsmOperand;
}
def AVX512RCOperand : AsmOperandClass {
let Name = "AVX512RC";
}
def AVX512RC : Operand<i32> {
let PrintMethod = "printRoundingControl";
let OperandNamespace = "X86";
let OperandType = "OPERAND_ROUNDING_CONTROL";
let ParserMatchClass = AVX512RCOperand;
}
// Sign-extended immediate classes. We don't need to define the full lattice
// here because there is no instruction with an ambiguity between ImmSExti64i32
// and ImmSExti32i8.
//
// The strange ranges come from the fact that the assembler always works with
// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
// (which will be a -1ULL) and "0xFFFF" (-1 in 16 bits).
// [0, 0x7FFFFFFF] |
// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
let Name = "ImmSExti64i32";
}
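// (Worked example: "andq $0xFFFFFFFF80000000, %rax" matches, since the value
// is INT32_MIN sign-extended to 64 bits, while "andq $0x80000000, %rax" does
// not: 0x0000000080000000 falls in neither accepted range.)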
// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] |
// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
let Name = "ImmSExti16i8";
let SuperClasses = [ImmSExti64i32AsmOperand];
}
// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] |
// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
let Name = "ImmSExti32i8";
}
// [0, 0x0000007F] |
// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
let Name = "ImmSExti64i8";
let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand,
ImmSExti64i32AsmOperand];
}
// 4-bit immediate used by some XOP instructions
// [0, 0xF]
def ImmUnsignedi4AsmOperand : AsmOperandClass {
let Name = "ImmUnsignedi4";
let RenderMethod = "addImmOperands";
let DiagnosticType = "InvalidImmUnsignedi4";
}
// Unsigned immediate used by SSE/AVX instructions
// [0, 0xFF]
// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
def ImmUnsignedi8AsmOperand : AsmOperandClass {
let Name = "ImmUnsignedi8";
let RenderMethod = "addImmOperands";
}
// A couple more descriptive operand definitions.
// 16 bits, but only 8 bits are significant.
def i16i8imm : Operand<i16> {
let ParserMatchClass = ImmSExti16i8AsmOperand;
let OperandType = "OPERAND_IMMEDIATE";
}
// 32 bits, but only 8 bits are significant.
def i32i8imm : Operand<i32> {
let ParserMatchClass = ImmSExti32i8AsmOperand;
let OperandType = "OPERAND_IMMEDIATE";
}
// 64 bits, but only 32 bits are significant.
def i64i32imm : Operand<i64> {
let ParserMatchClass = ImmSExti64i32AsmOperand;
let OperandType = "OPERAND_IMMEDIATE";
}
// 64 bits, but only 8 bits are significant.
def i64i8imm : Operand<i64> {
let ParserMatchClass = ImmSExti64i8AsmOperand;
let OperandType = "OPERAND_IMMEDIATE";
}
// Unsigned 4-bit immediate used by some XOP instructions.
def u4imm : Operand<i8> {
let PrintMethod = "printU8Imm";
let ParserMatchClass = ImmUnsignedi4AsmOperand;
let OperandType = "OPERAND_IMMEDIATE";
}
// Unsigned 8-bit immediate used by SSE/AVX instructions.
def u8imm : Operand<i8> {
let PrintMethod = "printU8Imm";
let ParserMatchClass = ImmUnsignedi8AsmOperand;
let OperandType = "OPERAND_IMMEDIATE";
}
// 16-bit immediate, but only 8 bits are significant and they are unsigned.
// Used by BT instructions.
def i16u8imm : Operand<i16> {
let PrintMethod = "printU8Imm";
let ParserMatchClass = ImmUnsignedi8AsmOperand;
let OperandType = "OPERAND_IMMEDIATE";
}
// 32-bit immediate, but only 8 bits are significant and they are unsigned.
// Used by some SSE/AVX instructions that use intrinsics.
def i32u8imm : Operand<i32> {
let PrintMethod = "printU8Imm";
let ParserMatchClass = ImmUnsignedi8AsmOperand;
let OperandType = "OPERAND_IMMEDIATE";
}
// 64-bit immediate, but only 8 bits are significant and they are unsigned.
// Used by BT instructions.
def i64u8imm : Operand<i64> {
let PrintMethod = "printU8Imm";
let ParserMatchClass = ImmUnsignedi8AsmOperand;
let OperandType = "OPERAND_IMMEDIATE";
}
def lea64_32mem : Operand<i32> {
let PrintMethod = "printanymem";
let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
let ParserMatchClass = X86MemAsmOperand;
}
// Memory operands that use 64-bit pointers in both ILP32 and LP64.
def lea64mem : Operand<i64> {
let PrintMethod = "printanymem";
let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
let ParserMatchClass = X86MemAsmOperand;
}
let RenderMethod = "addMaskPairOperands" in {
def VK1PairAsmOperand : AsmOperandClass { let Name = "VK1Pair"; }
def VK2PairAsmOperand : AsmOperandClass { let Name = "VK2Pair"; }
def VK4PairAsmOperand : AsmOperandClass { let Name = "VK4Pair"; }
def VK8PairAsmOperand : AsmOperandClass { let Name = "VK8Pair"; }
def VK16PairAsmOperand : AsmOperandClass { let Name = "VK16Pair"; }
}
def VK1Pair : RegisterOperand<VK1PAIR, "printVKPair"> {
let ParserMatchClass = VK1PairAsmOperand;
}
def VK2Pair : RegisterOperand<VK2PAIR, "printVKPair"> {
let ParserMatchClass = VK2PairAsmOperand;
}
def VK4Pair : RegisterOperand<VK4PAIR, "printVKPair"> {
let ParserMatchClass = VK4PairAsmOperand;
}
def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> {
let ParserMatchClass = VK8PairAsmOperand;
}
def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> {
let ParserMatchClass = VK16PairAsmOperand;
}
//===----------------------------------------------------------------------===//
// X86 Complex Pattern Definitions.
//
// Define X86-specific addressing mode.
def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>;
def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr",
[add, sub, mul, X86mul_imm, shl, or, frameindex],
[]>;
// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr",
[add, sub, mul, X86mul_imm, shl, or,
frameindex, X86WrapperRIP],
[]>;
def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
[tglobaltlsaddr], []>;
def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
[tglobaltlsaddr], []>;
def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr",
[add, sub, mul, X86mul_imm, shl, or, frameindex,
X86WrapperRIP], []>;
def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
[tglobaltlsaddr], []>;
def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
[tglobaltlsaddr], []>;
def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>;
// A relocatable immediate is either an immediate operand or an operand that can
// be relocated by the linker to an immediate, such as a regular symbol in
// non-PIC code.
def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [],
0>;
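// (Example: in non-PIC code, "movl $some_sym, %eax", where some_sym stands
// for any ordinary defined symbol, can be selected with the symbol as a
// relocImm; the linker later patches the 32-bit immediate with the symbol's
// absolute address.)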
//===----------------------------------------------------------------------===//
// X86 Instruction Predicate Definitions.
def TruePredicate : Predicate<"true">;
def HasCMov : Predicate<"Subtarget->hasCMov()">;
def NoCMov : Predicate<"!Subtarget->hasCMov()">;
def HasMMX : Predicate<"Subtarget->hasMMX()">;
def Has3DNow : Predicate<"Subtarget->has3DNow()">;
def Has3DNowA : Predicate<"Subtarget->has3DNowA()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">;
def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
def NoAVX : Predicate<"!Subtarget->hasAVX()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
def HasCDI : Predicate<"Subtarget->hasCDI()">;
def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">;
def HasPFI : Predicate<"Subtarget->hasPFI()">;
def HasERI : Predicate<"Subtarget->hasERI()">;
def HasDQI : Predicate<"Subtarget->hasDQI()">;
def NoDQI : Predicate<"!Subtarget->hasDQI()">;
def HasBWI : Predicate<"Subtarget->hasBWI()">;
def NoBWI : Predicate<"!Subtarget->hasBWI()">;
def HasVLX : Predicate<"Subtarget->hasVLX()">;
def NoVLX : Predicate<"!Subtarget->hasVLX()">;
def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
def PKU : Predicate<"Subtarget->hasPKU()">;
def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">;
def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
def HasVAES : Predicate<"Subtarget->hasVAES()">;
def NoVLX_Or_NoVAES : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVAES()">;
def HasFXSR : Predicate<"Subtarget->hasFXSR()">;
def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">;
def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">;
def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">;
def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">;
def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
def NoVLX_Or_NoVPCLMULQDQ :
Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVPCLMULQDQ()">;
def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">;
def HasGFNI : Predicate<"Subtarget->hasGFNI()">;
def HasFMA : Predicate<"Subtarget->hasFMA()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def NoFMA4 : Predicate<"!Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
def HasTBM : Predicate<"Subtarget->hasTBM()">;
def NoTBM : Predicate<"!Subtarget->hasTBM()">;
def HasLWP : Predicate<"Subtarget->hasLWP()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
def HasF16C : Predicate<"Subtarget->hasF16C()">;
def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">;
def HasVBMI : Predicate<"Subtarget->hasVBMI()">;
def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">;
def HasIFMA : Predicate<"Subtarget->hasIFMA()">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
def HasADX : Predicate<"Subtarget->hasADX()">;
def HasSHA : Predicate<"Subtarget->hasSHA()">;
def HasSGX : Predicate<"Subtarget->hasSGX()">;
def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">;
def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;
def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">;
def HasCLDEMOTE : Predicate<"Subtarget->hasCLDEMOTE()">;
def HasMOVDIRI : Predicate<"Subtarget->hasMOVDIRI()">;
def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">;
def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">;
def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">;
def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">;
def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
AssemblerPredicate<"Mode64Bit", "64-bit mode">;
def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">;
def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">;
def In16BitMode : Predicate<"Subtarget->is16Bit()">,
AssemblerPredicate<"Mode16Bit", "16-bit mode">;
def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">;
def In32BitMode : Predicate<"Subtarget->is32Bit()">,
AssemblerPredicate<"Mode32Bit", "32-bit mode">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
"Subtarget->getFrameLowering()->hasFP(*MF)"> {
let RecomputePerFunction = 1;
}
def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
"TM.getCodeModel() == CodeModel::Kernel">;
def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
// We could compute these on a per-module basis but doing so requires accessing
// the Function object through the <Target>Subtarget and objections were raised
// to that (see post-commit review comments for r301750).
let RecomputePerFunction = 1 in {
def OptForSize : Predicate<"shouldOptForSize(MF)">;
def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">;
def OptForSpeed : Predicate<"!shouldOptForSize(MF)">;
def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
"shouldOptForSize(MF)">;
def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
"!Subtarget->hasSSE41()">;
}
def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
def FavorMemIndirectCall : Predicate<"!Subtarget->slowTwoMemOps()">;
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
-def UseRetpolineIndirectCalls : Predicate<"Subtarget->useRetpolineIndirectCalls()">;
-def NotUseRetpolineIndirectCalls : Predicate<"!Subtarget->useRetpolineIndirectCalls()">;
+def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
+def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
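// ("Indirect thunk" is the umbrella term here: the same predicate gates
// retpoline-style thunks as well as other mitigations, e.g. LVI hardening,
// that route indirect calls through a thunk.)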
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
//
include "X86InstrFormats.td"
//===----------------------------------------------------------------------===//
// Pattern fragments.
//
// X86-specific condition codes. These correspond to CondCode in
// X86InstrInfo.h and must be kept in sync with it.
def X86_COND_O : PatLeaf<(i8 0)>;
def X86_COND_NO : PatLeaf<(i8 1)>;
def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
def X86_COND_AE : PatLeaf<(i8 3)>; // alt. COND_NC
def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
def X86_COND_NE : PatLeaf<(i8 5)>; // alt. COND_NZ
def X86_COND_BE : PatLeaf<(i8 6)>; // alt. COND_NA
def X86_COND_A : PatLeaf<(i8 7)>; // alt. COND_NBE
def X86_COND_S : PatLeaf<(i8 8)>;
def X86_COND_NS : PatLeaf<(i8 9)>;
def X86_COND_P : PatLeaf<(i8 10)>; // alt. COND_PE
def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
def X86_COND_L : PatLeaf<(i8 12)>; // alt. COND_NGE
def X86_COND_GE : PatLeaf<(i8 13)>; // alt. COND_NL
def X86_COND_LE : PatLeaf<(i8 14)>; // alt. COND_NG
def X86_COND_G : PatLeaf<(i8 15)>; // alt. COND_NLE
def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
// FIXME: Ideally we would just replace the above i*immSExt* matchers with
// relocImm-based matchers, but then FastISel would be unable to use them.
def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{
return isSExtRelocImm<8>(N);
}]>;
def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
return isSExtRelocImm<32>(N);
}]>;
// If we have multiple users of an immediate, it's much smaller to reuse
// the register rather than encode the immediate in every instruction.
// This risks increasing register pressure from stretched live ranges;
// however, the immediates should be trivial to rematerialize by
// the RA in the event of high register pressure.
// TODO : This is currently enabled for stores and binary ops. There are more
// cases for which this can be enabled, though this catches the bulk of the
// issues.
// TODO2 : This should really also be enabled under O2, but there's currently
// an issue with RA where we don't pull the constants into their users
// when we rematerialize them. I'll follow up on enabling O2 after we fix that
// issue.
// TODO3 : This is currently limited to single basic blocks (DAG creation
// pulls block immediates to the top and merges them if necessary).
// Eventually, it would be nice to allow ConstantHoisting to merge constants
// globally for potentially added savings.
//
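// (Size intuition, illustrative: two "movl $imm32, (mem)" stores encode the
// 4-byte immediate twice, while materializing it once into a register and
// storing that register twice encodes the immediate only once, which is a
// net win when optimizing for size.)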
def relocImm8_su : PatLeaf<(i8 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
def relocImm16_su : PatLeaf<(i16 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
def relocImm32_su : PatLeaf<(i32 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
// unsigned field.
def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
def i64immZExt32SExt8 : ImmLeaf<i64, [{
return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm));
}]>;
// Helper fragments for loads.
// It's safe to fold a zextload/extload from i1 as a regular i8 load. The
// upper bits are guaranteed to be zero and we were going to emit a MOV8rm
// which might get folded during peephole anyway.
def loadi8 : PatFrag<(ops node:$ptr), (i8 (unindexedload node:$ptr)), [{
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
return ExtType == ISD::NON_EXTLOAD || ExtType == ISD::EXTLOAD ||
ExtType == ISD::ZEXTLOAD;
}]>;
// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
// known to be 32-bit aligned or better. Ditto for i8 to i16.
def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
if (ExtType == ISD::NON_EXTLOAD)
return true;
if (ExtType == ISD::EXTLOAD)
return LD->getAlignment() >= 2 && LD->isSimple();
return false;
}]>;
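// (The alignment check encodes the safety argument: a 2-byte aligned 16-bit
// access, or a 4-byte aligned 32-bit access, can never straddle a page
// boundary, so widening the load cannot fault on an unmapped page.)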
def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
if (ExtType == ISD::NON_EXTLOAD)
return true;
if (ExtType == ISD::EXTLOAD)
return LD->getAlignment() >= 4 && LD->isSimple();
return false;
}]>;
def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>;
def alignedloadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{
LoadSDNode *Ld = cast<LoadSDNode>(N);
return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
}]>;
def memopf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{
LoadSDNode *Ld = cast<LoadSDNode>(N);
return Subtarget->hasSSEUnalignedMem() ||
Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
}]>;
def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
// We can treat an i8/i16 extending load to i64 as a 32-bit load if it's known
// to be 4-byte aligned or better.
def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [{
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
if (ExtType != ISD::EXTLOAD)
return false;
if (LD->getMemoryVT() == MVT::i32)
return true;
return LD->getAlignment() >= 4 && LD->isSimple();
}]>;
// An 'and' node with a single use.
def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
return N->hasOneUse();
}]>;
// An 'srl' node with a single use.
def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
return N->hasOneUse();
}]>;
// A 'trunc' node with a single use.
def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
return N->hasOneUse();
}]>;
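// (Why single-use matters, e.g. for and_su: TEST can be selected from
// (X86cmp (and_su x, y), 0), but if the 'and' had other uses its result would
// be needed in a register anyway, so folding it into TEST would only
// recompute it.)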
//===----------------------------------------------------------------------===//
// Instruction list.
//
// Nop
let hasSideEffects = 0, SchedRW = [WriteNop] in {
def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
"nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable;
def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero),
"nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable;
def NOOPQ : RI<0x1f, MRMXm, (outs), (ins i64mem:$zero),
"nop{q}\t$zero", []>, TB, NotMemoryFoldable,
Requires<[In64BitMode]>;
// Also allow register forms so we can assemble/disassemble them.
def NOOPWr : I<0x1f, MRMXr, (outs), (ins GR16:$zero),
"nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable;
def NOOPLr : I<0x1f, MRMXr, (outs), (ins GR32:$zero),
"nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable;
def NOOPQr : RI<0x1f, MRMXr, (outs), (ins GR64:$zero),
"nop{q}\t$zero", []>, TB, NotMemoryFoldable,
Requires<[In64BitMode]>;
}
// Constructing a stack frame.
def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
"enter\t$len, $lvl", []>, Sched<[WriteMicrocoded]>;
let SchedRW = [WriteALU] in {
let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in
def LEAVE : I<0xC9, RawFrm, (outs), (ins), "leave", []>,
Requires<[Not64BitMode]>;
let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in
def LEAVE64 : I<0xC9, RawFrm, (outs), (ins), "leave", []>,
Requires<[In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//
let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1,
SchedRW = [WriteSystem] in
def Int_eh_sjlj_setup_dispatch
: PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>;
let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in {
let mayLoad = 1, SchedRW = [WriteLoad] in {
def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
OpSize16;
def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>,
OpSize32, Requires<[Not64BitMode]>;
// Long form for the disassembler.
let isCodeGenOnly = 1, ForceDisassemble = 1 in {
def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
OpSize16, NotMemoryFoldable;
def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>,
OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable;
} // isCodeGenOnly = 1, ForceDisassemble = 1
} // mayLoad, SchedRW
let mayStore = 1, mayLoad = 1, SchedRW = [WriteCopy] in {
def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", []>,
OpSize16;
def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", []>,
OpSize32, Requires<[Not64BitMode]>;
} // mayStore, mayLoad, SchedRW
let mayStore = 1, SchedRW = [WriteStore] in {
def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
OpSize16;
def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>,
OpSize32, Requires<[Not64BitMode]>;
// Long form for the disassembler.
let isCodeGenOnly = 1, ForceDisassemble = 1 in {
def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
OpSize16, NotMemoryFoldable;
def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>,
OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable;
} // isCodeGenOnly = 1, ForceDisassemble = 1
def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
"push{w}\t$imm", []>, OpSize16;
def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
"push{w}\t$imm", []>, OpSize16;
def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
"push{l}\t$imm", []>, OpSize32,
Requires<[Not64BitMode]>;
def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
"push{l}\t$imm", []>, OpSize32,
Requires<[Not64BitMode]>;
} // mayStore, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in {
def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src", []>,
OpSize16;
def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src", []>,
OpSize32, Requires<[Not64BitMode]>;
} // mayLoad, mayStore, SchedRW
}
let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
SchedRW = [WriteRMW], Defs = [ESP] in {
let Uses = [ESP] in
def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins),
[(set GR32:$dst, (int_x86_flags_read_u32))]>,
Requires<[Not64BitMode]>;
let Uses = [RSP] in
def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins),
[(set GR64:$dst, (int_x86_flags_read_u64))]>,
Requires<[In64BitMode]>;
}
let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
SchedRW = [WriteRMW] in {
let Defs = [ESP, EFLAGS, DF], Uses = [ESP] in
def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src),
[(int_x86_flags_write_u32 GR32:$src)]>,
Requires<[Not64BitMode]>;
let Defs = [RSP, EFLAGS, DF], Uses = [RSP] in
def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src),
[(int_x86_flags_write_u64 GR64:$src)]>,
Requires<[In64BitMode]>;
}
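// For illustration (an assumption about the frontend mapping, not stated in
// this file): clang exposes these intrinsics through its EFLAGS builtins,
// e.g.
//
//   #include <cstdint>
//
//   uint64_t readFlags()        { return __builtin_ia32_readeflags_u64(); }
//   void writeFlags(uint64_t F) { __builtin_ia32_writeeflags_u64(F); }
//
// The custom inserter expands each pseudo into a pushf/pop (or push/popf)
// sequence, which is why the stack pointer appears in Uses and Defs.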
let Defs = [ESP, EFLAGS, DF], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
SchedRW = [WriteLoad] in {
def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize16;
def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>, OpSize32,
Requires<[Not64BitMode]>;
}
let Defs = [ESP], Uses = [ESP, EFLAGS, DF], mayStore = 1, hasSideEffects=0,
SchedRW = [WriteStore] in {
def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize16;
def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, OpSize32,
Requires<[Not64BitMode]>;
}
let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in {
let mayLoad = 1, SchedRW = [WriteLoad] in {
def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>,
OpSize32, Requires<[In64BitMode]>;
// Long form for the disassembler.
let isCodeGenOnly = 1, ForceDisassemble = 1 in {
def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>,
OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable;
} // isCodeGenOnly = 1, ForceDisassemble = 1
} // mayLoad, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in
def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", []>,
OpSize32, Requires<[In64BitMode]>;
let mayStore = 1, SchedRW = [WriteStore] in {
def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", []>,
OpSize32, Requires<[In64BitMode]>;
// Long form for the disassembler.
let isCodeGenOnly = 1, ForceDisassemble = 1 in {
def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>,
OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable;
} // isCodeGenOnly = 1, ForceDisassemble = 1
} // mayStore, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in {
def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>,
OpSize32, Requires<[In64BitMode]>;
} // mayLoad, mayStore, SchedRW
}
let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
SchedRW = [WriteStore] in {
def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
"push{q}\t$imm", []>, OpSize32,
Requires<[In64BitMode]>;
def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
"push{q}\t$imm", []>, OpSize32,
Requires<[In64BitMode]>;
}
let Defs = [RSP, EFLAGS, DF], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in
def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>,
OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
let Defs = [RSP], Uses = [RSP, EFLAGS, DF], mayStore = 1, hasSideEffects=0 in
def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>,
OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;
let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in {
def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", []>,
OpSize32, Requires<[Not64BitMode]>;
def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", []>,
OpSize16, Requires<[Not64BitMode]>;
}
let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", []>,
OpSize32, Requires<[Not64BitMode]>;
def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", []>,
OpSize16, Requires<[Not64BitMode]>;
}
let Constraints = "$src = $dst", SchedRW = [WriteBSWAP32] in {
// This instruction is a consequence of BSWAP32r observing operand size. The
// encoding is valid, but the behavior is undefined.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def BSWAP16r_BAD : I<0xC8, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
"bswap{w}\t$dst", []>, OpSize16, TB;
// GR32 = bswap GR32
def BSWAP32r : I<0xC8, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
"bswap{l}\t$dst",
[(set GR32:$dst, (bswap GR32:$src))]>, OpSize32, TB;
let SchedRW = [WriteBSWAP64] in
def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
"bswap{q}\t$dst",
[(set GR64:$dst, (bswap GR64:$src))]>, TB;
} // Constraints = "$src = $dst", SchedRW
// Bit scan instructions.
let Defs = [EFLAGS] in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>,
PS, OpSize16, Sched<[WriteBSF]>;
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>,
PS, OpSize16, Sched<[WriteBSFLd]>;
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>,
PS, OpSize32, Sched<[WriteBSF]>;
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>,
PS, OpSize32, Sched<[WriteBSFLd]>;
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>,
PS, Sched<[WriteBSF]>;
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>,
PS, Sched<[WriteBSFLd]>;
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>,
PS, OpSize16, Sched<[WriteBSR]>;
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>,
PS, OpSize16, Sched<[WriteBSRLd]>;
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>,
PS, OpSize32, Sched<[WriteBSR]>;
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>,
PS, OpSize32, Sched<[WriteBSRLd]>;
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>,
PS, Sched<[WriteBSR]>;
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>,
PS, Sched<[WriteBSRLd]>;
} // Defs = [EFLAGS]
let SchedRW = [WriteMicrocoded] in {
let Defs = [EDI,ESI], Uses = [EDI,ESI,DF] in {
def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
"movsb\t{$src, $dst|$dst, $src}", []>;
def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
"movsw\t{$src, $dst|$dst, $src}", []>, OpSize16;
def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
"movs{l|d}\t{$src, $dst|$dst, $src}", []>, OpSize32;
def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
"movsq\t{$src, $dst|$dst, $src}", []>,
Requires<[In64BitMode]>;
}
let Defs = [EDI], Uses = [AL,EDI,DF] in
def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst),
"stosb\t{%al, $dst|$dst, al}", []>;
let Defs = [EDI], Uses = [AX,EDI,DF] in
def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst),
"stosw\t{%ax, $dst|$dst, ax}", []>, OpSize16;
let Defs = [EDI], Uses = [EAX,EDI,DF] in
def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst),
"stos{l|d}\t{%eax, $dst|$dst, eax}", []>, OpSize32;
let Defs = [RDI], Uses = [RAX,RDI,DF] in
def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst),
"stosq\t{%rax, $dst|$dst, rax}", []>,
Requires<[In64BitMode]>;
let Defs = [EDI,EFLAGS], Uses = [AL,EDI,DF] in
def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst),
"scasb\t{$dst, %al|al, $dst}", []>;
let Defs = [EDI,EFLAGS], Uses = [AX,EDI,DF] in
def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst),
"scasw\t{$dst, %ax|ax, $dst}", []>, OpSize16;
let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,DF] in
def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst),
"scas{l|d}\t{$dst, %eax|eax, $dst}", []>, OpSize32;
let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,DF] in
def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst),
"scasq\t{$dst, %rax|rax, $dst}", []>,
Requires<[In64BitMode]>;
let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,DF] in {
def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
"cmpsb\t{$dst, $src|$src, $dst}", []>;
def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
"cmpsw\t{$dst, $src|$src, $dst}", []>, OpSize16;
def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
"cmps{l|d}\t{$dst, $src|$src, $dst}", []>, OpSize32;
def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
"cmpsq\t{$dst, $src|$src, $dst}", []>,
Requires<[In64BitMode]>;
}
} // SchedRW
//===----------------------------------------------------------------------===//
// Move Instructions.
//
let SchedRW = [WriteMove] in {
let hasSideEffects = 0, isMoveReg = 1 in {
def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
"mov{b}\t{$src, $dst|$dst, $src}", []>;
def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;
def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", []>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
[(set GR8:$dst, imm:$src)]>;
def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, imm:$src)]>, OpSize16;
def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, relocImm:$src)]>, OpSize32;
def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, i64immSExt32:$src)]>;
}
let isReMaterializable = 1, isMoveImm = 1 in {
def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
"movabs{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, relocImm:$src)]>;
}
// Longer forms that use a ModR/M byte. Needed for the disassembler.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}", []>,
FoldGenData<"MOV8ri">;
def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
FoldGenData<"MOV16ri">;
def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
FoldGenData<"MOV32ri">;
}
} // SchedRW
let SchedRW = [WriteStore] in {
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
[(store (i8 relocImm8_su:$src), addr:$dst)]>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
[(store (i16 relocImm16_su:$src), addr:$dst)]>, OpSize16;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
[(store (i32 relocImm32_su:$src), addr:$dst)]>, OpSize32;
def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(store i64relocImmSExt32_su:$src, addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
let hasSideEffects = 0 in {
/// Memory offset versions of moves. The immediate is an address-mode-sized
/// offset from the segment base.
let SchedRW = [WriteALU] in {
let mayLoad = 1 in {
let Defs = [AL] in
def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src),
"mov{b}\t{$src, %al|al, $src}", []>,
AdSize32;
let Defs = [AX] in
def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src),
"mov{w}\t{$src, %ax|ax, $src}", []>,
OpSize16, AdSize32;
let Defs = [EAX] in
def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src),
"mov{l}\t{$src, %eax|eax, $src}", []>,
OpSize32, AdSize32;
let Defs = [RAX] in
def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src),
"mov{q}\t{$src, %rax|rax, $src}", []>,
AdSize32;
let Defs = [AL] in
def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src),
"mov{b}\t{$src, %al|al, $src}", []>, AdSize16;
let Defs = [AX] in
def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src),
"mov{w}\t{$src, %ax|ax, $src}", []>,
OpSize16, AdSize16;
let Defs = [EAX] in
def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src),
"mov{l}\t{$src, %eax|eax, $src}", []>,
AdSize16, OpSize32;
} // mayLoad
let mayStore = 1 in {
let Uses = [AL] in
def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst),
"mov{b}\t{%al, $dst|$dst, al}", []>, AdSize32;
let Uses = [AX] in
def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst),
"mov{w}\t{%ax, $dst|$dst, ax}", []>,
OpSize16, AdSize32;
let Uses = [EAX] in
def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst),
"mov{l}\t{%eax, $dst|$dst, eax}", []>,
OpSize32, AdSize32;
let Uses = [RAX] in
def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst),
"mov{q}\t{%rax, $dst|$dst, rax}", []>,
AdSize32;
let Uses = [AL] in
def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst),
"mov{b}\t{%al, $dst|$dst, al}", []>, AdSize16;
let Uses = [AX] in
def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst),
"mov{w}\t{%ax, $dst|$dst, ax}", []>,
OpSize16, AdSize16;
let Uses = [EAX] in
def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst),
"mov{l}\t{%eax, $dst|$dst, eax}", []>,
OpSize32, AdSize16;
} // mayStore
// These forms all have full 64-bit absolute addresses in their instructions
// and use the movabs mnemonic to indicate this specific form.
let mayLoad = 1 in {
let Defs = [AL] in
def MOV8ao64 : Ii64<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
"movabs{b}\t{$src, %al|al, $src}", []>,
AdSize64;
let Defs = [AX] in
def MOV16ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
"movabs{w}\t{$src, %ax|ax, $src}", []>,
OpSize16, AdSize64;
let Defs = [EAX] in
def MOV32ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
"movabs{l}\t{$src, %eax|eax, $src}", []>,
OpSize32, AdSize64;
let Defs = [RAX] in
def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src),
"movabs{q}\t{$src, %rax|rax, $src}", []>,
AdSize64;
} // mayLoad
let mayStore = 1 in {
let Uses = [AL] in
def MOV8o64a : Ii64<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
"movabs{b}\t{%al, $dst|$dst, al}", []>,
AdSize64;
let Uses = [AX] in
def MOV16o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
"movabs{w}\t{%ax, $dst|$dst, ax}", []>,
OpSize16, AdSize64;
let Uses = [EAX] in
def MOV32o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
"movabs{l}\t{%eax, $dst|$dst, eax}", []>,
OpSize32, AdSize64;
let Uses = [RAX] in
def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
"movabs{q}\t{%rax, $dst|$dst, rax}", []>,
AdSize64;
} // mayStore
} // SchedRW
} // hasSideEffects = 0
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
SchedRW = [WriteMove], isMoveReg = 1 in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
"mov{b}\t{$src, $dst|$dst, $src}", []>,
FoldGenData<"MOV8rr">;
def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
FoldGenData<"MOV16rr">;
def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
FoldGenData<"MOV32rr">;
def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", []>,
FoldGenData<"MOV64rr">;
}
// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"mov{b}.s\t{$src, $dst|$dst, $src}",
(MOV8rr_REV GR8:$dst, GR8:$src), 0>;
def : InstAlias<"mov{w}.s\t{$src, $dst|$dst, $src}",
(MOV16rr_REV GR16:$dst, GR16:$src), 0>;
def : InstAlias<"mov{l}.s\t{$src, $dst|$dst, $src}",
(MOV32rr_REV GR32:$dst, GR32:$src), 0>;
def : InstAlias<"mov{q}.s\t{$src, $dst|$dst, $src}",
(MOV64rr_REV GR64:$dst, GR64:$src), 0>;
def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
(MOV8rr_REV GR8:$dst, GR8:$src), 0, "att">;
def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
(MOV16rr_REV GR16:$dst, GR16:$src), 0, "att">;
def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
(MOV32rr_REV GR32:$dst, GR32:$src), 0, "att">;
def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
(MOV64rr_REV GR64:$dst, GR64:$src), 0, "att">;
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
[(set GR8:$dst, (loadi8 addr:$src))]>;
def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (loadi16 addr:$src))]>, OpSize16;
def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (loadi32 addr:$src))]>, OpSize32;
def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (load addr:$src))]>;
}
let SchedRW = [WriteStore] in {
def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
[(store GR8:$src, addr:$dst)]>;
def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
[(store GR16:$src, addr:$dst)]>, OpSize16;
def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
[(store GR32:$src, addr:$dst)]>, OpSize32;
def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(store GR64:$src, addr:$dst)]>;
} // SchedRW
// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
// that they can be used for copying and storing h registers, which can't be
// encoded when a REX prefix is present.
let isCodeGenOnly = 1 in {
let hasSideEffects = 0, isMoveReg = 1 in
def MOV8rr_NOREX : I<0x88, MRMDestReg,
(outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
"mov{b}\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteMove]>;
let mayStore = 1, hasSideEffects = 0 in
def MOV8mr_NOREX : I<0x88, MRMDestMem,
(outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
"mov{b}\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteStore]>;
let mayLoad = 1, hasSideEffects = 0,
canFoldAsLoad = 1, isReMaterializable = 1 in
def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
(outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
"mov{b}\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteLoad]>;
}
// Condition code ops, incl. set if equal/not equal/...
let SchedRW = [WriteLAHFSAHF] in {
let Defs = [EFLAGS], Uses = [AH] in
def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
[(set EFLAGS, (X86sahf AH))]>,
Requires<[HasLAHFSAHF]>;
let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags
Requires<[HasLAHFSAHF]>;
} // SchedRW
//===----------------------------------------------------------------------===//
// Bit test instructions: BT, BTS, BTR, BTC.
let Defs = [EFLAGS] in {
let SchedRW = [WriteBitTest] in {
def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>,
OpSize16, TB, NotMemoryFoldable;
def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>,
OpSize32, TB, NotMemoryFoldable;
def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB,
NotMemoryFoldable;
} // SchedRW
// Unlike with the register+register form, the memory+register form of the
// bt instruction does not ignore the high bits of the index. From ISel's
// perspective, this is pretty bizarre. Make these instructions disassembly
// only for now. These instructions are also slow on modern CPUs so that's
// another reason to avoid generating them.
let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteBitTestRegLd] in {
def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[]>, OpSize16, TB, NotMemoryFoldable;
def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[]>, OpSize32, TB, NotMemoryFoldable;
def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[]>, TB, NotMemoryFoldable;
}
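// For illustration (not part of the .td definitions): a C++ sketch of the
// memory form's unmasked indexing; the register form instead reduces the
// index modulo the operand width.
//
//   #include <cstdint>
//
//   bool bitTest64(const uint64_t *Base, int64_t BitIdx) {
//     // BT m64,r64 selects the qword at Base + (BitIdx >> 6), then tests
//     // bit (BitIdx & 63), so a large index reads far past *Base.
//     return (Base[BitIdx >> 6] >> (BitIdx & 63)) & 1;
//   }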
let SchedRW = [WriteBitTest] in {
def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16u8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR16:$src1, imm:$src2))]>,
OpSize16, TB;
def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32u8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR32:$src1, imm:$src2))]>,
OpSize32, TB;
def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64u8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR64:$src1, imm:$src2))]>, TB;
} // SchedRW
// Note that these forms are not slow: the slowness noted above only applies
// when the other operand is in a register. With an immediate operand, bt is
// still fast.
let SchedRW = [WriteBitTestImmLd] in {
def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi16 addr:$src1),
imm:$src2))]>,
OpSize16, TB;
def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi32 addr:$src1),
imm:$src2))]>,
OpSize32, TB;
def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
imm:$src2))]>, TB,
Requires<[In64BitMode]>;
} // SchedRW
let hasSideEffects = 0 in {
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTC32rr : I<0xBB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTC64rr : RI<0xBB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetRegRMW] in {
def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTR32rr : I<0xB3, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTR64rr : RI<0xB3, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetRegRMW] in {
def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB;
def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB;
def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTS32rr : I<0xAB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTS64rr : RI<0xAB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetRegRMW] in {
def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB, NotMemoryFoldable;
def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
NotMemoryFoldable;
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
} // hasSideEffects = 0
} // Defs = [EFLAGS]
//===----------------------------------------------------------------------===//
// Atomic support
//
// Atomic swap. These are just normal xchg instructions, but since a memory
// operand is referenced the lock is implicit, so atomicity is ensured.
multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag> {
let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
(ins GR8:$val, i8mem:$ptr),
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
[(set
GR8:$dst,
(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>;
def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$val, i16mem:$ptr),
!strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
[(set
GR16:$dst,
(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>,
OpSize16;
def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
OpSize32;
def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
!strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
[(set
GR64:$dst,
(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>;
}
}
defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap">, NotMemoryFoldable;
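// For illustration (not part of the .td definitions): a C++ atomic exchange
// that typically selects one of the rm forms above.
//
//   #include <atomic>
//   #include <cstdint>
//
//   uint32_t swap(std::atomic<uint32_t> &A, uint32_t V) {
//     return A.exchange(V); // xchg %esi, (%rdi); implicitly locked
//   }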
// Swap between registers.
let SchedRW = [WriteXCHG] in {
let Constraints = "$src1 = $dst1, $src2 = $dst2", hasSideEffects = 0 in {
def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst1, GR8:$dst2),
(ins GR8:$src1, GR8:$src2),
"xchg{b}\t{$src2, $src1|$src1, $src2}", []>, NotMemoryFoldable;
def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst1, GR16:$dst2),
(ins GR16:$src1, GR16:$src2),
"xchg{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, NotMemoryFoldable;
def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst1, GR32:$dst2),
(ins GR32:$src1, GR32:$src2),
"xchg{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, NotMemoryFoldable;
def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst1, GR64:$dst2),
(ins GR64:$src1 ,GR64:$src2),
"xchg{q}\t{$src2, $src1|$src1, $src2}", []>, NotMemoryFoldable;
}
// Swap between EAX and other registers.
let Constraints = "$src = $dst", hasSideEffects = 0 in {
let Uses = [AX], Defs = [AX] in
def XCHG16ar : I<0x90, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
"xchg{w}\t{$src, %ax|ax, $src}", []>, OpSize16;
let Uses = [EAX], Defs = [EAX] in
def XCHG32ar : I<0x90, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
"xchg{l}\t{$src, %eax|eax, $src}", []>, OpSize32;
let Uses = [RAX], Defs = [RAX] in
def XCHG64ar : RI<0x90, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
"xchg{q}\t{$src, %rax|rax, $src}", []>;
}
} // SchedRW
let hasSideEffects = 0, Constraints = "$src1 = $dst1, $src2 = $dst2",
Defs = [EFLAGS], SchedRW = [WriteXCHG] in {
def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst1, GR8:$dst2),
(ins GR8:$src1, GR8:$src2),
"xadd{b}\t{$src2, $src1|$src1, $src2}", []>, TB;
def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst1, GR16:$dst2),
(ins GR16:$src1, GR16:$src2),
"xadd{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst1, GR32:$dst2),
(ins GR32:$src1, GR32:$src2),
"xadd{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst1, GR64:$dst2),
(ins GR64:$src1, GR64:$src2),
"xadd{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, hasSideEffects = 0, Constraints = "$val = $dst",
Defs = [EFLAGS], SchedRW = [WriteALULd, WriteRMW] in {
def XADD8rm : I<0xC0, MRMSrcMem, (outs GR8:$dst),
(ins GR8:$val, i8mem:$ptr),
"xadd{b}\t{$val, $ptr|$ptr, $val}", []>, TB;
def XADD16rm : I<0xC1, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$val, i16mem:$ptr),
"xadd{w}\t{$val, $ptr|$ptr, $val}", []>, TB,
OpSize16;
def XADD32rm : I<0xC1, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
"xadd{l}\t{$val, $ptr|$ptr, $val}", []>, TB,
OpSize32;
def XADD64rm : RI<0xC1, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
"xadd{q}\t{$val, $ptr|$ptr, $val}", []>, TB;
}
let SchedRW = [WriteCMPXCHG], hasSideEffects = 0 in {
let Defs = [AL, EFLAGS], Uses = [AL] in
def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
"cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB,
NotMemoryFoldable;
let Defs = [AX, EFLAGS], Uses = [AX] in
def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
"cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16,
NotMemoryFoldable;
let Defs = [EAX, EFLAGS], Uses = [EAX] in
def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
"cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32,
NotMemoryFoldable;
let Defs = [RAX, EFLAGS], Uses = [RAX] in
def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB,
NotMemoryFoldable;
} // SchedRW, hasSideEffects
let SchedRW = [WriteCMPXCHGRMW], mayLoad = 1, mayStore = 1,
hasSideEffects = 0 in {
let Defs = [AL, EFLAGS], Uses = [AL] in
def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
"cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB,
NotMemoryFoldable;
let Defs = [AX, EFLAGS], Uses = [AX] in
def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16,
NotMemoryFoldable;
let Defs = [EAX, EFLAGS], Uses = [EAX] in
def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32,
NotMemoryFoldable;
let Defs = [RAX, EFLAGS], Uses = [RAX] in
def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB,
NotMemoryFoldable;
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
"cmpxchg8b\t$dst", []>, TB, Requires<[HasCmpxchg8b]>;
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
// NOTE: In64BitMode check needed for the AssemblerPredicate.
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
"cmpxchg16b\t$dst", []>,
TB, Requires<[HasCmpxchg16b,In64BitMode]>;
} // SchedRW, mayLoad, mayStore, hasSideEffects
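// For illustration (not part of the .td definitions): a C++ compare-and-swap
// that is normally lowered to LOCK CMPXCHG. The expected value travels in
// the accumulator (the implicit Uses/Defs above) and the replacement in the
// explicit register operand.
//
//   #include <atomic>
//   #include <cstdint>
//
//   bool cas(std::atomic<uint64_t> &A, uint64_t &Expected, uint64_t Desired) {
//     return A.compare_exchange_strong(Expected, Desired);
//   }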
// Lock instruction prefix
let SchedRW = [WriteMicrocoded] in
def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
let SchedRW = [WriteNop] in {
// Rex64 instruction prefix
def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
Requires<[In64BitMode]>;
// Data16 instruction prefix
def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
} // SchedRW
// Repeat string operation instruction prefixes
let Defs = [ECX], Uses = [ECX,DF], SchedRW = [WriteMicrocoded] in {
// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>;
// Repeat while not equal (used with CMPS and SCAS)
def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
}
// String manipulation instructions
let SchedRW = [WriteMicrocoded] in {
let Defs = [AL,ESI], Uses = [ESI,DF] in
def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src),
"lodsb\t{$src, %al|al, $src}", []>;
let Defs = [AX,ESI], Uses = [ESI,DF] in
def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src),
"lodsw\t{$src, %ax|ax, $src}", []>, OpSize16;
let Defs = [EAX,ESI], Uses = [ESI,DF] in
def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src),
"lods{l|d}\t{$src, %eax|eax, $src}", []>, OpSize32;
let Defs = [RAX,ESI], Uses = [ESI,DF] in
def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src),
"lodsq\t{$src, %rax|rax, $src}", []>,
Requires<[In64BitMode]>;
}
let SchedRW = [WriteSystem] in {
let Defs = [ESI], Uses = [DX,ESI,DF] in {
def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src),
"outsb\t{$src, %dx|dx, $src}", []>;
def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src),
"outsw\t{$src, %dx|dx, $src}", []>, OpSize16;
def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
"outs{l|d}\t{$src, %dx|dx, $src}", []>, OpSize32;
}
let Defs = [EDI], Uses = [DX,EDI,DF] in {
def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst),
"insb\t{%dx, $dst|$dst, dx}", []>;
def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst),
"insw\t{%dx, $dst|$dst, dx}", []>, OpSize16;
def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst),
"ins{l|d}\t{%dx, $dst|$dst, dx}", []>, OpSize32;
}
}
// EFLAGS management instructions.
let SchedRW = [WriteALU], Defs = [EFLAGS], Uses = [EFLAGS] in {
def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>;
def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>;
def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>;
}
// DF management instructions.
let SchedRW = [WriteALU], Defs = [DF] in {
def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>;
def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>;
}
// Table lookup instructions
let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in
def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>, Sched<[WriteLoad]>;
let SchedRW = [WriteMicrocoded] in {
// ASCII Adjust After Addition
let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>,
Requires<[Not64BitMode]>;
// ASCII Adjust AX Before Division
let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
"aad\t$src", []>, Requires<[Not64BitMode]>;
// ASCII Adjust AX After Multiply
let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
"aam\t$src", []>, Requires<[Not64BitMode]>;
// ASCII Adjust AL After Subtraction - sets AF and CF.
let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>,
Requires<[Not64BitMode]>;
// Decimal Adjust AL after Addition
let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>,
Requires<[Not64BitMode]>;
// Decimal Adjust AL after Subtraction
let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>,
Requires<[Not64BitMode]>;
} // SchedRW
let SchedRW = [WriteSystem] in {
// Check Array Index Against Bounds
// Note: "bound" does not have reversed operands in AT&T syntax.
def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bound\t$dst, $src", []>, OpSize16,
Requires<[Not64BitMode]>;
def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bound\t$dst, $src", []>, OpSize32,
Requires<[Not64BitMode]>;
// Adjust RPL Field of Segment Selector
def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", []>,
Requires<[Not64BitMode]>, NotMemoryFoldable;
let mayStore = 1 in
def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", []>,
Requires<[Not64BitMode]>, NotMemoryFoldable;
} // SchedRW
//===----------------------------------------------------------------------===//
// MOVBE Instructions
//
let Predicates = [HasMOVBE] in {
let SchedRW = [WriteALULd] in {
def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (bswap (loadi16 addr:$src)))]>,
OpSize16, T8PS;
def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bswap (loadi32 addr:$src)))]>,
OpSize32, T8PS;
def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bswap (loadi64 addr:$src)))]>,
T8PS;
}
let SchedRW = [WriteStore] in {
def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
[(store (bswap GR16:$src), addr:$dst)]>,
OpSize16, T8PS;
def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
[(store (bswap GR32:$src), addr:$dst)]>,
OpSize32, T8PS;
def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
[(store (bswap GR64:$src), addr:$dst)]>,
T8PS;
}
}
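// For illustration (not part of the .td definitions): a byte-swapping load
// in C++ that matches the (bswap (loadi32 addr)) pattern above and, when
// MOVBE is available, can be selected as MOVBE32rm.
//
//   #include <cstdint>
//   #include <cstring>
//
//   uint32_t loadBE32(const void *P) { // read a big-endian 32-bit field
//     uint32_t V;
//     std::memcpy(&V, P, sizeof V);
//     return __builtin_bswap32(V);
//   }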
//===----------------------------------------------------------------------===//
// RDRAND Instruction
//
let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
"rdrand{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdrand))]>,
OpSize16, PS;
def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
"rdrand{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdrand))]>,
OpSize32, PS;
def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
"rdrand{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdrand))]>,
PS;
}
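// For illustration (not part of the .td definitions): rdrand reports success
// in CF, which is why the pattern produces both a value and EFLAGS. A sketch
// using the standard intrinsic:
//
//   #include <immintrin.h>
//
//   bool tryRandom(unsigned &Out) {
//     return _rdrand32_step(&Out) == 1; // callers should retry on failure
//   }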
//===----------------------------------------------------------------------===//
// RDSEED Instruction
//
let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, PS;
def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst",
[(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, PS;
def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst",
[(set GR64:$dst, EFLAGS, (X86rdseed))]>, PS;
}
//===----------------------------------------------------------------------===//
// LZCNT Instruction
//
let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>,
XS, OpSize16, Sched<[WriteLZCNT]>;
def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctlz (loadi16 addr:$src))),
(implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteLZCNTLd]>;
def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>,
XS, OpSize32, Sched<[WriteLZCNT]>;
def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctlz (loadi32 addr:$src))),
(implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteLZCNTLd]>;
def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
XS, Sched<[WriteLZCNT]>;
def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctlz (loadi64 addr:$src))),
(implicit EFLAGS)]>, XS, Sched<[WriteLZCNTLd]>;
}
//===----------------------------------------------------------------------===//
// BMI Instructions
//
let Predicates = [HasBMI], Defs = [EFLAGS] in {
def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>,
XS, OpSize16, Sched<[WriteTZCNT]>;
def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (cttz (loadi16 addr:$src))),
(implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteTZCNTLd]>;
def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>,
XS, OpSize32, Sched<[WriteTZCNT]>;
def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (cttz (loadi32 addr:$src))),
(implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteTZCNTLd]>;
def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
XS, Sched<[WriteTZCNT]>;
def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (cttz (loadi64 addr:$src))),
(implicit EFLAGS)]>, XS, Sched<[WriteTZCNTLd]>;
}
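// For illustration (not part of the .td definitions): unlike bsr/bsf,
// lzcnt/tzcnt are defined for a zero input (they return the operand width),
// which is why ctlz/cttz can map onto them directly.
//
//   #include <cstdint>
//
//   unsigned leadingZeros32(uint32_t V) {
//     return V ? __builtin_clz(V) : 32; // one lzcnt when the feature is on
//   }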
multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
RegisterClass RC, X86MemOperand x86memop,
X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
T8PS, VEX_4V, Sched<[sched]>;
let mayLoad = 1 in
def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
T8PS, VEX_4V, Sched<[sched.Folded]>;
}
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>;
defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, VEX_W;
defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>;
defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, VEX_W;
defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>;
defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, VEX_W;
}
//===----------------------------------------------------------------------===//
// Pattern fragments to auto-generate BMI instructions.
//===----------------------------------------------------------------------===//
def or_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
(X86or_flag node:$lhs, node:$rhs), [{
return hasNoCarryFlagUses(SDValue(N, 1));
}]>;
def xor_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
(X86xor_flag node:$lhs, node:$rhs), [{
return hasNoCarryFlagUses(SDValue(N, 1));
}]>;
def and_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
(X86and_flag node:$lhs, node:$rhs), [{
return hasNoCarryFlagUses(SDValue(N, 1));
}]>;
let Predicates = [HasBMI] in {
// FIXME: patterns for the load versions are not implemented
def : Pat<(and GR32:$src, (add GR32:$src, -1)),
(BLSR32rr GR32:$src)>;
def : Pat<(and GR64:$src, (add GR64:$src, -1)),
(BLSR64rr GR64:$src)>;
def : Pat<(xor GR32:$src, (add GR32:$src, -1)),
(BLSMSK32rr GR32:$src)>;
def : Pat<(xor GR64:$src, (add GR64:$src, -1)),
(BLSMSK64rr GR64:$src)>;
def : Pat<(and GR32:$src, (ineg GR32:$src)),
(BLSI32rr GR32:$src)>;
def : Pat<(and GR64:$src, (ineg GR64:$src)),
(BLSI64rr GR64:$src)>;
// Versions to match flag producing ops.
def : Pat<(and_flag_nocf GR32:$src, (add GR32:$src, -1)),
(BLSR32rr GR32:$src)>;
def : Pat<(and_flag_nocf GR64:$src, (add GR64:$src, -1)),
(BLSR64rr GR64:$src)>;
def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, -1)),
(BLSMSK32rr GR32:$src)>;
def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, -1)),
(BLSMSK64rr GR64:$src)>;
def : Pat<(and_flag_nocf GR32:$src, (ineg GR32:$src)),
(BLSI32rr GR32:$src)>;
def : Pat<(and_flag_nocf GR64:$src, (ineg GR64:$src)),
(BLSI64rr GR64:$src)>;
}
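// For illustration (not part of the .td definitions): the classic bit tricks
// the patterns above recognize, worked through on x = 0b101100.
//
//   #include <cassert>
//   #include <cstdint>
//
//   int main() {
//     uint32_t X = 0b101100;
//     assert((X & (X - 1)) == 0b101000);  // BLSR: clear lowest set bit
//     assert((X ^ (X - 1)) == 0b000111);  // BLSMSK: mask up through it
//     assert((X & (0u - X)) == 0b000100); // BLSI: isolate it
//     return 0;
//   }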
multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC,
X86MemOperand x86memop, SDNode OpNode,
PatFrag ld_frag, X86FoldableSchedWrite Sched> {
def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
T8PS, VEX, Sched<[Sched]>;
def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)),
(implicit EFLAGS)]>, T8PS, VEX,
Sched<[Sched.Folded,
// x86memop:$src1
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadDefault,
// RC:$src2
Sched.ReadAfterFold]>;
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
defm BEXTR32 : bmi_bextr<0xF7, "bextr{l}", GR32, i32mem,
X86bextr, loadi32, WriteBEXTR>;
defm BEXTR64 : bmi_bextr<0xF7, "bextr{q}", GR64, i64mem,
X86bextr, loadi64, WriteBEXTR>, VEX_W;
}
multiclass bmi_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int,
PatFrag ld_frag, X86FoldableSchedWrite Sched> {
def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
T8PS, VEX, Sched<[Sched]>;
def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
(implicit EFLAGS)]>, T8PS, VEX,
Sched<[Sched.Folded,
// x86memop:$src1
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadDefault,
// RC:$src2
Sched.ReadAfterFold]>;
}
let Predicates = [HasBMI2], Defs = [EFLAGS] in {
defm BZHI32 : bmi_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
X86bzhi, loadi32, WriteBZHI>;
defm BZHI64 : bmi_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
X86bzhi, loadi64, WriteBZHI>, VEX_W;
}
def CountTrailingOnes : SDNodeXForm<imm, [{
// Count the trailing ones in the immediate.
return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N));
}]>;
def BEXTRMaskXForm : SDNodeXForm<imm, [{
unsigned Length = countTrailingOnes(N->getZExtValue());
return getI32Imm(Length << 8, SDLoc(N));
}]>;
def AndMask64 : ImmLeaf<i64, [{
return isMask_64(Imm) && !isUInt<32>(Imm);
}]>;
// Use BEXTR for 64-bit 'and' with large immediate 'mask'.
let Predicates = [HasBMI, NoBMI2, NoTBM] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BEXTR64rr GR64:$src,
(SUBREG_TO_REG (i64 0),
(MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
(BEXTR64rm addr:$src,
(SUBREG_TO_REG (i64 0),
(MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
}
// Use BZHI for 64-bit 'and' with large immediate 'mask'.
let Predicates = [HasBMI2, NoTBM] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BZHI64rr GR64:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
(BZHI64rm addr:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
}
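// For illustration (not part of the .td definitions): a worked example of
// the two transforms for Mask = 0x0000FFFFFFFFFFFF, i.e. 48 trailing ones,
// which is too wide for an AND with a sign-extended imm32.
//
//   #include <cassert>
//   #include <cstdint>
//
//   int main() {
//     uint64_t Mask = 0x0000FFFFFFFFFFFF;        // accepted by AndMask64
//     unsigned Len = __builtin_popcountll(Mask); // == countTrailingOnes here
//     assert(Len == 48);
//     unsigned BextrCtl = Len << 8;   // BEXTRMaskXForm: start 0, length 48
//     assert(BextrCtl == 0x3000);
//     // BZHI takes the bit index in a register (the MOV8ri above): it
//     // clears all bits at positions >= 48, i.e. dst = src & Mask.
//     return 0;
//   }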
multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int,
PatFrag ld_frag> {
def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
VEX_4V, Sched<[WriteALU]>;
def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>,
VEX_4V, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
}
let Predicates = [HasBMI2] in {
defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
int_x86_bmi_pdep_32, loadi32>, T8XD;
defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W;
defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
int_x86_bmi_pext_32, loadi32>, T8XS;
defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W;
}
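// For illustration (not part of the .td definitions): what pdep/pext
// compute, via the standard intrinsics.
//
//   #include <immintrin.h>
//
//   unsigned demo() {
//     unsigned G = _pext_u32(0xabcd, 0x0f0f); // gather nibbles b,d -> 0xbd
//     return _pdep_u32(G, 0x0f0f);            // scatter back      -> 0x0b0d
//   }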
//===----------------------------------------------------------------------===//
// TBM Instructions
//
let Predicates = [HasTBM], Defs = [EFLAGS] in {
multiclass tbm_ternary_imm<bits<8> opc, RegisterClass RC, string OpcodeStr,
X86MemOperand x86memop, PatFrag ld_frag,
SDNode OpNode, Operand immtype,
SDPatternOperator immoperator,
X86FoldableSchedWrite Sched> {
def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
[(set RC:$dst, (OpNode RC:$src1, immoperator:$cntl))]>,
XOP, XOPA, Sched<[Sched]>;
def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
[(set RC:$dst, (OpNode (ld_frag addr:$src1), immoperator:$cntl))]>,
XOP, XOPA, Sched<[Sched.Folded]>;
}
defm BEXTRI32 : tbm_ternary_imm<0x10, GR32, "bextr{l}", i32mem, loadi32,
X86bextr, i32imm, imm, WriteBEXTR>;
let ImmT = Imm32S in
defm BEXTRI64 : tbm_ternary_imm<0x10, GR64, "bextr{q}", i64mem, loadi64,
X86bextr, i64i32imm,
i64immSExt32, WriteBEXTR>, VEX_W;
multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
RegisterClass RC, string OpcodeStr,
X86MemOperand x86memop, X86FoldableSchedWrite Sched> {
let hasSideEffects = 0 in {
def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
XOP_4V, XOP9, Sched<[Sched]>;
let mayLoad = 1 in
def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
XOP_4V, XOP9, Sched<[Sched.Folded]>;
}
}
multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite Sched,
Format FormReg, Format FormMem> {
defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr#"{l}",
i32mem, Sched>;
defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr#"{q}",
i64mem, Sched>, VEX_W;
}
defm BLCFILL : tbm_binary_intr<0x01, "blcfill", WriteALU, MRM1r, MRM1m>;
defm BLCI : tbm_binary_intr<0x02, "blci", WriteALU, MRM6r, MRM6m>;
defm BLCIC : tbm_binary_intr<0x01, "blcic", WriteALU, MRM5r, MRM5m>;
defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", WriteALU, MRM1r, MRM1m>;
defm BLCS : tbm_binary_intr<0x01, "blcs", WriteALU, MRM3r, MRM3m>;
defm BLSFILL : tbm_binary_intr<0x01, "blsfill", WriteALU, MRM2r, MRM2m>;
defm BLSIC : tbm_binary_intr<0x01, "blsic", WriteALU, MRM6r, MRM6m>;
defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", WriteALU, MRM7r, MRM7m>;
defm TZMSK : tbm_binary_intr<0x01, "tzmsk", WriteALU, MRM4r, MRM4m>;
} // HasTBM, EFLAGS
// Use BEXTRI for 64-bit 'and' with large immediate 'mask'.
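// BEXTRMaskXForm converts a mask of N trailing ones into the BEXTRI control
// value (N << 8), i.e. start = 0 and length = N; e.g. the 40-ones mask
// 0xFFFFFFFFFF becomes the control value 0x2800.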
let Predicates = [HasTBM] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BEXTRI64ri GR64:$src, (BEXTRMaskXForm imm:$mask))>;
def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
(BEXTRI64mi addr:$src, (BEXTRMaskXForm imm:$mask))>;
}
//===----------------------------------------------------------------------===//
// Lightweight Profiling Instructions
let Predicates = [HasLWP], SchedRW = [WriteSystem] in {
def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src",
[(int_x86_llwpcb GR32:$src)]>, XOP, XOP9;
def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst",
[(set GR32:$dst, (int_x86_slwpcb))]>, XOP, XOP9;
def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src",
[(int_x86_llwpcb GR64:$src)]>, XOP, XOP9, VEX_W;
def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst",
[(set GR64:$dst, (int_x86_slwpcb))]>, XOP, XOP9, VEX_W;
multiclass lwpins_intr<RegisterClass RC> {
def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
[(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, timm:$cntl))]>,
XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
[(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), timm:$cntl))]>,
XOP_4V, XOPA;
}
let Defs = [EFLAGS] in {
defm LWPINS32 : lwpins_intr<GR32>;
defm LWPINS64 : lwpins_intr<GR64>, VEX_W;
} // EFLAGS
multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
[(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
[(Int RC:$src0, (loadi32 addr:$src1), timm:$cntl)]>,
XOP_4V, XOPA;
}
defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>;
defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W;
} // HasLWP, SchedRW
//===----------------------------------------------------------------------===//
// MONITORX/MWAITX Instructions
//
let SchedRW = [ WriteSystem ] in {
let Uses = [ EAX, ECX, EDX ] in
def MONITORX32rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
TB, Requires<[ HasMWAITX, Not64BitMode ]>;
let Uses = [ RAX, ECX, EDX ] in
def MONITORX64rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
TB, Requires<[ HasMWAITX, In64BitMode ]>;
let Uses = [ ECX, EAX, EBX ] in {
def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
[(int_x86_mwaitx ECX, EAX, EBX)]>,
TB, Requires<[ HasMWAITX ]>;
}
} // SchedRW
def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>,
Requires<[ Not64BitMode ]>;
def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>,
Requires<[ In64BitMode ]>;
def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORX32rrr)>,
Requires<[ Not64BitMode ]>;
def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORX64rrr)>,
Requires<[ In64BitMode ]>;
//===----------------------------------------------------------------------===//
// WAITPKG Instructions
//
let SchedRW = [WriteSystem] in {
def UMONITOR16 : I<0xAE, MRM6r, (outs), (ins GR16:$src),
"umonitor\t$src", [(int_x86_umonitor GR16:$src)]>,
XS, AdSize16, Requires<[HasWAITPKG, Not64BitMode]>;
def UMONITOR32 : I<0xAE, MRM6r, (outs), (ins GR32:$src),
"umonitor\t$src", [(int_x86_umonitor GR32:$src)]>,
XS, AdSize32, Requires<[HasWAITPKG]>;
def UMONITOR64 : I<0xAE, MRM6r, (outs), (ins GR64:$src),
"umonitor\t$src", [(int_x86_umonitor GR64:$src)]>,
XS, AdSize64, Requires<[HasWAITPKG, In64BitMode]>;
let Uses = [EAX, EDX], Defs = [EFLAGS] in {
def UMWAIT : I<0xAE, MRM6r,
(outs), (ins GR32orGR64:$src), "umwait\t$src",
[(set EFLAGS, (X86umwait GR32orGR64:$src, EDX, EAX))]>,
XD, Requires<[HasWAITPKG]>;
def TPAUSE : I<0xAE, MRM6r,
(outs), (ins GR32orGR64:$src), "tpause\t$src",
[(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>,
PD, Requires<[HasWAITPKG]>, NotMemoryFoldable;
}
} // SchedRW
//===----------------------------------------------------------------------===//
// MOVDIRI - Move doubleword/quadword as direct store
//
let SchedRW = [WriteStore] in {
def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movdiri\t{$src, $dst|$dst, $src}",
[(int_x86_directstore32 addr:$dst, GR32:$src)]>,
T8, Requires<[HasMOVDIRI]>;
def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movdiri\t{$src, $dst|$dst, $src}",
[(int_x86_directstore64 addr:$dst, GR64:$src)]>,
T8, Requires<[In64BitMode, HasMOVDIRI]>;
} // SchedRW
//===----------------------------------------------------------------------===//
// MOVDIR64B - Move 64 bytes as direct store
//
let SchedRW = [WriteStore] in {
def MOVDIR64B16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
"movdir64b\t{$src, $dst|$dst, $src}", []>,
T8PD, AdSize16, Requires<[HasMOVDIR64B, Not64BitMode]>;
def MOVDIR64B32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
"movdir64b\t{$src, $dst|$dst, $src}",
[(int_x86_movdir64b GR32:$dst, addr:$src)]>,
T8PD, AdSize32, Requires<[HasMOVDIR64B]>;
def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
"movdir64b\t{$src, $dst|$dst, $src}",
[(int_x86_movdir64b GR64:$dst, addr:$src)]>,
T8PD, AdSize64, Requires<[HasMOVDIR64B, In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
// ENQCMD/S - Enqueue 64-byte command as user with 64-byte write atomicity
//
let SchedRW = [WriteStore], Defs = [EFLAGS] in {
def ENQCMD16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
"enqcmd\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmd GR16:$dst, addr:$src))]>,
T8XD, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
def ENQCMD32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
"enqcmd\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmd GR32:$dst, addr:$src))]>,
T8XD, AdSize32, Requires<[HasENQCMD]>;
def ENQCMD64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
"enqcmd\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmd GR64:$dst, addr:$src))]>,
T8XD, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
def ENQCMDS16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
"enqcmds\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmds GR16:$dst, addr:$src))]>,
T8XS, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
def ENQCMDS32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
"enqcmds\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmds GR32:$dst, addr:$src))]>,
T8XS, AdSize32, Requires<[HasENQCMD]>;
def ENQCMDS64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
"enqcmds\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmds GR64:$dst, addr:$src))]>,
T8XS, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
}
//===----------------------------------------------------------------------===//
// CLZERO Instruction
//
let SchedRW = [WriteLoad] in {
let Uses = [EAX] in
def CLZERO32r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
TB, Requires<[HasCLZERO, Not64BitMode]>;
let Uses = [RAX] in
def CLZERO64r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
TB, Requires<[HasCLZERO, In64BitMode]>;
} // SchedRW
def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>;
def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
// Pattern fragments to auto generate TBM instructions.
//===----------------------------------------------------------------------===//
let Predicates = [HasTBM] in {
// FIXME: patterns for the load versions are not implemented
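// Each pattern below maps a bit-manipulation identity onto its TBM
// instruction; e.g. blcfill(x) == x & (x + 1), which clears the trailing ones
// of x: blcfill(0b10011) = 0b10011 & 0b10100 = 0b10000.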
def : Pat<(and GR32:$src, (add GR32:$src, 1)),
(BLCFILL32rr GR32:$src)>;
def : Pat<(and GR64:$src, (add GR64:$src, 1)),
(BLCFILL64rr GR64:$src)>;
def : Pat<(or GR32:$src, (not (add GR32:$src, 1))),
(BLCI32rr GR32:$src)>;
def : Pat<(or GR64:$src, (not (add GR64:$src, 1))),
(BLCI64rr GR64:$src)>;
// Extra patterns because opt can optimize the above patterns to this.
def : Pat<(or GR32:$src, (sub -2, GR32:$src)),
(BLCI32rr GR32:$src)>;
def : Pat<(or GR64:$src, (sub -2, GR64:$src)),
(BLCI64rr GR64:$src)>;
def : Pat<(and (not GR32:$src), (add GR32:$src, 1)),
(BLCIC32rr GR32:$src)>;
def : Pat<(and (not GR64:$src), (add GR64:$src, 1)),
(BLCIC64rr GR64:$src)>;
def : Pat<(xor GR32:$src, (add GR32:$src, 1)),
(BLCMSK32rr GR32:$src)>;
def : Pat<(xor GR64:$src, (add GR64:$src, 1)),
(BLCMSK64rr GR64:$src)>;
def : Pat<(or GR32:$src, (add GR32:$src, 1)),
(BLCS32rr GR32:$src)>;
def : Pat<(or GR64:$src, (add GR64:$src, 1)),
(BLCS64rr GR64:$src)>;
def : Pat<(or GR32:$src, (add GR32:$src, -1)),
(BLSFILL32rr GR32:$src)>;
def : Pat<(or GR64:$src, (add GR64:$src, -1)),
(BLSFILL64rr GR64:$src)>;
def : Pat<(or (not GR32:$src), (add GR32:$src, -1)),
(BLSIC32rr GR32:$src)>;
def : Pat<(or (not GR64:$src), (add GR64:$src, -1)),
(BLSIC64rr GR64:$src)>;
def : Pat<(or (not GR32:$src), (add GR32:$src, 1)),
(T1MSKC32rr GR32:$src)>;
def : Pat<(or (not GR64:$src), (add GR64:$src, 1)),
(T1MSKC64rr GR64:$src)>;
def : Pat<(and (not GR32:$src), (add GR32:$src, -1)),
(TZMSK32rr GR32:$src)>;
def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
(TZMSK64rr GR64:$src)>;
// Patterns to match flag producing ops.
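// The *_flag_nocf fragments match only when the carry-flag result of the
// logic op is unused, presumably because the TBM instructions are not
// guaranteed to produce the same CF value as the generic or/and/xor (which
// always clear CF).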
def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))),
(BLCI32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))),
(BLCI64rr GR64:$src)>;
// Extra patterns because opt can optimize the above patterns to this.
def : Pat<(or_flag_nocf GR32:$src, (sub -2, GR32:$src)),
(BLCI32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (sub -2, GR64:$src)),
(BLCI64rr GR64:$src)>;
def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, 1)),
(BLCIC32rr GR32:$src)>;
def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, 1)),
(BLCIC64rr GR64:$src)>;
def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, 1)),
(BLCMSK32rr GR32:$src)>;
def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, 1)),
(BLCMSK64rr GR64:$src)>;
def : Pat<(or_flag_nocf GR32:$src, (add GR32:$src, 1)),
(BLCS32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (add GR64:$src, 1)),
(BLCS64rr GR64:$src)>;
def : Pat<(or_flag_nocf GR32:$src, (add GR32:$src, -1)),
(BLSFILL32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (add GR64:$src, -1)),
(BLSFILL64rr GR64:$src)>;
def : Pat<(or_flag_nocf (not GR32:$src), (add GR32:$src, -1)),
(BLSIC32rr GR32:$src)>;
def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, -1)),
(BLSIC64rr GR64:$src)>;
def : Pat<(or_flag_nocf (not GR32:$src), (add GR32:$src, 1)),
(T1MSKC32rr GR32:$src)>;
def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, 1)),
(T1MSKC64rr GR64:$src)>;
def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, -1)),
(TZMSK32rr GR32:$src)>;
def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, -1)),
(TZMSK64rr GR64:$src)>;
} // HasTBM
//===----------------------------------------------------------------------===//
// Memory Instructions
//
let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in
def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
"clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
let Predicates = [HasCLWB], SchedRW = [WriteLoad] in
def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src",
[(int_x86_clwb addr:$src)]>, PD, NotMemoryFoldable;
let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in
def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src",
[(int_x86_cldemote addr:$src)]>, TB;
//===----------------------------------------------------------------------===//
// Subsystems.
//===----------------------------------------------------------------------===//
include "X86InstrArithmetic.td"
include "X86InstrCMovSetCC.td"
include "X86InstrExtension.td"
include "X86InstrControl.td"
include "X86InstrShiftRotate.td"
// X87 Floating Point Stack.
include "X86InstrFPStack.td"
// SIMD support (SSE, MMX and AVX)
include "X86InstrFragmentsSIMD.td"
// FMA - Fused Multiply-Add support (requires FMA)
include "X86InstrFMA.td"
// XOP
include "X86InstrXOP.td"
// SSE, MMX and 3DNow! vector support.
include "X86InstrSSE.td"
include "X86InstrAVX512.td"
include "X86InstrMMX.td"
include "X86Instr3DNow.td"
// MPX instructions
include "X86InstrMPX.td"
include "X86InstrVMX.td"
include "X86InstrSVM.td"
include "X86InstrTSX.td"
include "X86InstrSGX.td"
// System instructions.
include "X86InstrSystem.td"
// Compiler Pseudo Instructions and Pat Patterns
include "X86InstrCompiler.td"
include "X86InstrVecCompiler.td"
//===----------------------------------------------------------------------===//
// Assembler Mnemonic Aliases
//===----------------------------------------------------------------------===//
def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"cbw", "cbtw", "att">;
def : MnemonicAlias<"cwde", "cwtl", "att">;
def : MnemonicAlias<"cwd", "cwtd", "att">;
def : MnemonicAlias<"cdq", "cltd", "att">;
def : MnemonicAlias<"cdqe", "cltq", "att">;
def : MnemonicAlias<"cqo", "cqto", "att">;
// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq.
def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>;
def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>;
def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"loopz", "loope">;
def : MnemonicAlias<"loopnz", "loopne">;
def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"popf", "popfq", "intel">, Requires<[In64BitMode]>;
def : MnemonicAlias<"popfd", "popfl", "att">;
def : MnemonicAlias<"popfw", "popf", "intel">, Requires<[In32BitMode]>;
def : MnemonicAlias<"popfw", "popf", "intel">, Requires<[In64BitMode]>;
// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in
// all modes. However: "push (addr)" and "push $42" should default to
// pushl/pushq depending on the current mode. Similar for "pop %bx".
def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"pushf", "pushfq", "intel">, Requires<[In64BitMode]>;
def : MnemonicAlias<"pushfd", "pushfl", "att">;
def : MnemonicAlias<"pushfw", "pushf", "intel">, Requires<[In32BitMode]>;
def : MnemonicAlias<"pushfw", "pushf", "intel">, Requires<[In64BitMode]>;
def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>;
def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>;
def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>;
def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>;
def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>;
def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>;
def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"repe", "rep">;
def : MnemonicAlias<"repz", "rep">;
def : MnemonicAlias<"repnz", "repne">;
def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>;
// Apply 'ret' behavior to 'retn'
def : MnemonicAlias<"retn", "retw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"retn", "retl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"retn", "retq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"retn", "ret", "intel">;
def : MnemonicAlias<"sal", "shl", "intel">;
def : MnemonicAlias<"salb", "shlb", "att">;
def : MnemonicAlias<"salw", "shlw", "att">;
def : MnemonicAlias<"sall", "shll", "att">;
def : MnemonicAlias<"salq", "shlq", "att">;
def : MnemonicAlias<"smovb", "movsb", "att">;
def : MnemonicAlias<"smovw", "movsw", "att">;
def : MnemonicAlias<"smovl", "movsl", "att">;
def : MnemonicAlias<"smovq", "movsq", "att">;
def : MnemonicAlias<"ud2a", "ud2", "att">;
def : MnemonicAlias<"verrw", "verr", "att">;
// MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release'
def : MnemonicAlias<"acquire", "xacquire", "intel">;
def : MnemonicAlias<"release", "xrelease", "intel">;
// System instruction aliases.
def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>;
def : MnemonicAlias<"sysret", "sysretl", "att">;
def : MnemonicAlias<"sysexit", "sysexitl", "att">;
def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"lgdt", "lgdtw", "intel">, Requires<[In16BitMode]>;
def : MnemonicAlias<"lgdt", "lgdtd", "intel">, Requires<[In32BitMode]>;
def : MnemonicAlias<"lidt", "lidtw", "intel">, Requires<[In16BitMode]>;
def : MnemonicAlias<"lidt", "lidtd", "intel">, Requires<[In32BitMode]>;
def : MnemonicAlias<"sgdt", "sgdtw", "intel">, Requires<[In16BitMode]>;
def : MnemonicAlias<"sgdt", "sgdtd", "intel">, Requires<[In32BitMode]>;
def : MnemonicAlias<"sidt", "sidtw", "intel">, Requires<[In16BitMode]>;
def : MnemonicAlias<"sidt", "sidtd", "intel">, Requires<[In32BitMode]>;
// Floating point stack aliases.
def : MnemonicAlias<"fcmovz", "fcmove", "att">;
def : MnemonicAlias<"fcmova", "fcmovnbe", "att">;
def : MnemonicAlias<"fcmovnae", "fcmovb", "att">;
def : MnemonicAlias<"fcmovna", "fcmovbe", "att">;
def : MnemonicAlias<"fcmovae", "fcmovnb", "att">;
def : MnemonicAlias<"fcomip", "fcompi">;
def : MnemonicAlias<"fildq", "fildll", "att">;
def : MnemonicAlias<"fistpq", "fistpll", "att">;
def : MnemonicAlias<"fisttpq", "fisttpll", "att">;
def : MnemonicAlias<"fldcww", "fldcw", "att">;
def : MnemonicAlias<"fnstcww", "fnstcw", "att">;
def : MnemonicAlias<"fnstsww", "fnstsw", "att">;
def : MnemonicAlias<"fucomip", "fucompi">;
def : MnemonicAlias<"fwait", "wait">;
def : MnemonicAlias<"fxsaveq", "fxsave64", "att">;
def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">;
def : MnemonicAlias<"xsaveq", "xsave64", "att">;
def : MnemonicAlias<"xrstorq", "xrstor64", "att">;
def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">;
def : MnemonicAlias<"xrstorsq", "xrstors64", "att">;
def : MnemonicAlias<"xsavecq", "xsavec64", "att">;
def : MnemonicAlias<"xsavesq", "xsaves64", "att">;
class CondCodeAlias<string Prefix, string Suffix, string OldCond, string NewCond,
string VariantName>
: MnemonicAlias<!strconcat(Prefix, OldCond, Suffix),
!strconcat(Prefix, NewCond, Suffix), VariantName>;
/// IntegerCondCodeMnemonicAlias - This multiclass defines a set of
/// MnemonicAliases that canonicalize the condition code in a mnemonic, for
/// example "setz" -> "sete".
multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
string V = ""> {
def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb
def Z : CondCodeAlias<Prefix, Suffix, "z", "e", V>; // setz -> sete
def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe
def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae
def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae
def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle
def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge
def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne
def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp
def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp
def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb
def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta
def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl
def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg
}
// Aliases for set<CC>
defm : IntegerCondCodeMnemonicAlias<"set", "">;
// Aliases for j<CC>
defm : IntegerCondCodeMnemonicAlias<"j", "">;
// Aliases for cmov<CC>{w,l,q}
defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">;
defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">;
defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
// No size suffix for intel-style asm.
defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
//===----------------------------------------------------------------------===//
// Assembler Instruction Aliases
//===----------------------------------------------------------------------===//
// aad/aam default to base 10 if no operand is specified.
def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>;
def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;
// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
// Likewise for btc/btr/bts.
def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}",
(BT32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}",
(BTC32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}",
(BTR32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}",
(BTS32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
// clr aliases.
def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
def : InstAlias<"clr{w}\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
def : InstAlias<"clr{l}\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
def : InstAlias<"clr{q}\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
// lods aliases. Accept the destination being omitted because it's implicit
// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
// in the destination.
def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>;
def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>;
def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>;
def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>;
def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>;
def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>;
def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0, "intel">;
def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0, "intel">;
def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0, "intel">;
def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
// stos aliases. Accept the source being omitted because it's implicit in
// the mnemonic, or the mnemonic suffix being omitted because it's implicit
// in the source.
def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>;
def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>;
def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>;
def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>;
def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>;
def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>;
def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0, "intel">;
def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0, "intel">;
def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0, "intel">;
def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>;
// scas aliases. Accept the destination being omitted because it's implicit
// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
// in the destination.
def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>;
def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>;
def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>;
def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>;
def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>;
def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>;
def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0, "intel">;
def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0, "intel">;
def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0, "intel">;
def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>;
// cmps aliases. Accept the mnemonic suffix being omitted because it's
// implicit in the destination.
def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0, "intel">;
def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0, "intel">;
def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0, "intel">;
def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
// movs aliases. Accept the mnemonic suffix being omitted because it's
// implicit in the destination.
def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0, "intel">;
def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0, "intel">;
def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0, "intel">;
def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
// div and idiv aliases for explicit A register.
def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>;
def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>;
def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>;
def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>;
def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>;
def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>;
def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>;
def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>;
def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>;
def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>;
def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>;
def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>;
def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>;
def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
// Various unary fpstack operations default to operating on ST1.
// For example, "fxch" -> "fxch %st(1)"
def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>;
def : InstAlias<"fadd", (ADD_FPrST0 ST1), 0>;
def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>;
def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>;
def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>;
def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>;
def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>;
def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>;
def : InstAlias<"fxch", (XCH_F ST1), 0>;
def : InstAlias<"fcom", (COM_FST0r ST1), 0>;
def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>;
def : InstAlias<"fcomi", (COM_FIr ST1), 0>;
def : InstAlias<"fcompi", (COM_FIPr ST1), 0>;
def : InstAlias<"fucom", (UCOM_Fr ST1), 0>;
def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>;
def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>;
def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>;
// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate
// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
// gas.
multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
def : InstAlias<!strconcat(Mnemonic, "\t$op"),
(Inst RSTi:$op), EmitAlias>;
def : InstAlias<!strconcat(Mnemonic, "\t{%st, %st|st, st}"),
(Inst ST0), EmitAlias>;
}
defm : FpUnaryAlias<"fadd", ADD_FST0r, 0>;
defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>;
defm : FpUnaryAlias<"fsub", SUB_FST0r, 0>;
defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0, 0>;
defm : FpUnaryAlias<"fsubr", SUBR_FST0r, 0>;
defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0, 0>;
defm : FpUnaryAlias<"fmul", MUL_FST0r, 0>;
defm : FpUnaryAlias<"fmulp", MUL_FPrST0, 0>;
defm : FpUnaryAlias<"fdiv", DIV_FST0r, 0>;
defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0, 0>;
defm : FpUnaryAlias<"fdivr", DIVR_FST0r, 0>;
defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0, 0>;
defm : FpUnaryAlias<"fcomi", COM_FIr, 0>;
defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>;
defm : FpUnaryAlias<"fcompi", COM_FIPr, 0>;
defm : FpUnaryAlias<"fucompi", UCOM_FIPr, 0>;
// Handle "f{mulp,addp} $op, %st(0)" the same as "f{mulp,addp} $op", since they
// commute. We also allow fdiv[r]p/fsubrp even though they don't commute,
// solely because gas supports it.
def : InstAlias<"faddp\t{$op, %st|st, $op}", (ADD_FPrST0 RSTi:$op), 0>;
def : InstAlias<"fmulp\t{$op, %st|st, $op}", (MUL_FPrST0 RSTi:$op), 0>;
def : InstAlias<"fsub{|r}p\t{$op, %st|st, $op}", (SUBR_FPrST0 RSTi:$op), 0>;
def : InstAlias<"fsub{r|}p\t{$op, %st|st, $op}", (SUB_FPrST0 RSTi:$op), 0>;
def : InstAlias<"fdiv{|r}p\t{$op, %st|st, $op}", (DIVR_FPrST0 RSTi:$op), 0>;
def : InstAlias<"fdiv{r|}p\t{$op, %st|st, $op}", (DIV_FPrST0 RSTi:$op), 0>;
def : InstAlias<"fnstsw" , (FNSTSW16r), 0>;
// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
// this is compatible with what GAS does.
def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>;
def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>;
def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaquemem:$dst), 0>, Requires<[In16BitMode]>;
def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaquemem:$dst), 0>, Requires<[In16BitMode]>;
def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0, "att">, Requires<[In64BitMode]>;
def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0, "att">, Requires<[In32BitMode]>;
def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0, "att">, Requires<[In16BitMode]>;
// "imul <imm>, B" is an alias for "imul <imm>, B, B".
def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>;
def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>;
def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>;
def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>;
def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>;
def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>;
// ins aliases. Accept the mnemonic suffix being omitted because it's implicit
// in the destination.
def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0, "intel">;
def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0, "intel">;
def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0, "intel">;
// outs aliases. Accept the mnemonic suffix being omitted because it's implicit
// in the source.
def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0, "intel">;
def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0, "intel">;
def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0, "intel">;
// inb %dx -> inb %al, %dx
def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>;
def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>;
def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>;
def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>;
// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp
def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
// Match 'movd GR64, MMX' as an alias for movq to be compatible with gas,
// which supports this due to an old AMD documentation bug when 64-bit mode was
// created.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
(MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
(MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
// movsx aliases
def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0, "att">;
def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0, "att">;
def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0, "att">;
def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0, "att">;
def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0, "att">;
def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0, "att">;
def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0, "att">;
// movzx aliases
def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0, "att">;
def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0, "att">;
def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0, "att">;
def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0, "att">;
def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0, "att">;
def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0, "att">;
// Note: No GR32->GR64 movzx form.
// outb %dx -> outb %al, %dx
def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>;
def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>;
def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>;
def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>;
def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>;
def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>;
// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity
// errors, since its encoding is the most compact.
def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>;
// shld/shrd op,op -> shld op, op, CL
def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>;
def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>;
def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>;
def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>;
def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>;
def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>;
def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>;
def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>;
def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>;
def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>;
def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>;
/* FIXME: This is disabled because the asm matcher is currently incapable of
* matching a fixed immediate like $1.
// "shl X, $1" is an alias for "shl X".
multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> {
def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
(!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>;
def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
(!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>;
def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
(!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>;
def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
(!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>;
def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
(!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>;
def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
(!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>;
def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
(!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>;
def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
(!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>;
}
defm : ShiftRotateByOneAlias<"rcl", "RCL">;
defm : ShiftRotateByOneAlias<"rcr", "RCR">;
defm : ShiftRotateByOneAlias<"rol", "ROL">;
defm : ShiftRotateByOneAlias<"ror", "ROR">;
FIXME */
// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
def : InstAlias<"test{b}\t{$mem, $val|$val, $mem}",
(TEST8mr i8mem :$mem, GR8 :$val), 0>;
def : InstAlias<"test{w}\t{$mem, $val|$val, $mem}",
(TEST16mr i16mem:$mem, GR16:$val), 0>;
def : InstAlias<"test{l}\t{$mem, $val|$val, $mem}",
(TEST32mr i32mem:$mem, GR32:$val), 0>;
def : InstAlias<"test{q}\t{$mem, $val|$val, $mem}",
(TEST64mr i64mem:$mem, GR64:$val), 0>;
// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}",
(XCHG8rm GR8 :$val, i8mem :$mem), 0>;
def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}",
(XCHG16rm GR16:$val, i16mem:$mem), 0>;
def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}",
(XCHG32rm GR32:$val, i32mem:$mem), 0>;
def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}",
(XCHG64rm GR64:$val, i64mem:$mem), 0>;
// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>;
def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src), 0>;
def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
// In 64-bit mode, xchg %eax, %eax can't be encoded with the 0x90 opcode we
// would get by default because that opcode is defined as NOP. But xchg
// %eax, %eax implicitly zeroes the upper 32 bits, so alias it to the longer
// encoding.
def : InstAlias<"xchg{l}\t{%eax, %eax|eax, eax}",
(XCHG32rr EAX, EAX), 0>, Requires<[In64BitMode]>;
// xchg %rax, %rax is a nop in x86-64 and can be encoded as such. Without this
// we emit an unneeded REX.w prefix.
def : InstAlias<"xchg{q}\t{%rax, %rax|rax, rax}", (NOOP), 0>;
// These aliases exist to get the parser to prioritize matching 8-bit
// immediate encodings over matching the implicit ax/eax/rax encodings. By
// explicitly mentioning the A register here, these entries will be ordered
// first due to the more explicit immediate type.
def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>;
def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>;
def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>;
def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>;
def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>;
def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>;
def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>;
def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>;
def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>;
def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>;
def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>;
def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>;
def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>;
def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>;
def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>;
def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>;
def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>;
def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>;
def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>;
def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>;
def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>;
def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>;
def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>;
def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>;
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp (nonexistent)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp (revision 362609)
@@ -0,0 +1,900 @@
+//==-- X86LoadValueInjectionLoadHardening.cpp - LVI load hardening for x86 --=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Description: This pass finds Load Value Injection (LVI) gadgets consisting
+/// of a load from memory (i.e., SOURCE), and any operation that may transmit
+/// the value loaded from memory over a covert channel, or use the value loaded
+/// from memory to determine a branch/call target (i.e., SINK). After finding
+/// all such gadgets in a given function, the pass minimally inserts LFENCE
+/// instructions in such a manner that the following property is satisfied: for
+/// all SOURCE+SINK pairs, all paths in the CFG from SOURCE to SINK contain at
+/// least one LFENCE instruction. The algorithm that implements this minimal
+/// insertion is influenced by an academic paper that minimally inserts memory
+/// fences for high-performance concurrent programs:
+/// http://www.cs.ucr.edu/~lesani/companion/oopsla15/OOPSLA15.pdf
+/// The algorithm implemented in this pass is as follows:
+/// 1. Build a condensed CFG (i.e., a GadgetGraph) consisting only of the
+/// following components:
+/// - SOURCE instructions (also includes function arguments)
+/// - SINK instructions
+/// - Basic block entry points
+/// - Basic block terminators
+/// - LFENCE instructions
+/// 2. Analyze the GadgetGraph to determine which SOURCE+SINK pairs (i.e.,
+/// gadgets) are already mitigated by existing LFENCEs. If all gadgets have been
+/// mitigated, go to step 6.
+/// 3. Use a heuristic or plugin to approximate minimal LFENCE insertion.
+/// 4. Insert one LFENCE along each CFG edge that was cut in step 3.
+/// 5. Go to step 2.
+/// 6. If any LFENCEs were inserted, return `true` from runOnMachineFunction()
+/// to tell LLVM that the function was modified.
+///
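+/// As an illustration of step 3: if every SOURCE-to-SINK path of two gadgets
+/// passes through one shared CFG edge, cutting that single edge mitigates both
+/// gadgets with one LFENCE rather than one fence per gadget.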
+//===----------------------------------------------------------------------===//
+
+#include "ImmutableGraph.h"
+#include "X86.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define PASS_KEY "x86-lvi-load"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation");
+STATISTIC(NumFunctionsConsidered, "Number of functions analyzed");
+STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations "
+ "were deployed");
+STATISTIC(NumGadgets, "Number of LVI gadgets detected during analysis");
+
+static cl::opt<std::string> OptimizePluginPath(
+ PASS_KEY "-opt-plugin",
+ cl::desc("Specify a plugin to optimize LFENCE insertion"), cl::Hidden);
+
+static cl::opt<bool> NoConditionalBranches(
+ PASS_KEY "-no-cbranch",
+ cl::desc("Don't treat conditional branches as disclosure gadgets. This "
+ "may improve performance, at the cost of security."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EmitDot(
+ PASS_KEY "-dot",
+ cl::desc(
+ "For each function, emit a dot graph depicting potential LVI gadgets"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EmitDotOnly(
+ PASS_KEY "-dot-only",
+ cl::desc("For each function, emit a dot graph depicting potential LVI "
+ "gadgets, and do not insert any fences"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EmitDotVerify(
+ PASS_KEY "-dot-verify",
+ cl::desc("For each function, emit a dot graph to stdout depicting "
+ "potential LVI gadgets, used for testing purposes only"),
+ cl::init(false), cl::Hidden);
+
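+// An optimization plugin, if given, must export an `optimize_cut` symbol with
+// this signature; it receives the flattened node and edge arrays of the gadget
+// graph and reports the set of edges to cut via the `cut_edges` out-parameter.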
+static llvm::sys::DynamicLibrary OptimizeDL;
+typedef int (*OptimizeCutT)(unsigned int *nodes, unsigned int nodes_size,
+ unsigned int *edges, int *edge_values,
+ int *cut_edges /* out */, unsigned int edges_size);
+static OptimizeCutT OptimizeCut = nullptr;
+
+namespace {
+
+struct MachineGadgetGraph : ImmutableGraph<MachineInstr *, int> {
+ static constexpr int GadgetEdgeSentinel = -1;
+ static constexpr MachineInstr *const ArgNodeSentinel = nullptr;
+
+ using GraphT = ImmutableGraph<MachineInstr *, int>;
+ using Node = typename GraphT::Node;
+ using Edge = typename GraphT::Edge;
+ using size_type = typename GraphT::size_type;
+ MachineGadgetGraph(std::unique_ptr<Node[]> Nodes,
+ std::unique_ptr<Edge[]> Edges, size_type NodesSize,
+ size_type EdgesSize, int NumFences = 0, int NumGadgets = 0)
+ : GraphT(std::move(Nodes), std::move(Edges), NodesSize, EdgesSize),
+ NumFences(NumFences), NumGadgets(NumGadgets) {}
+ static inline bool isCFGEdge(const Edge &E) {
+ return E.getValue() != GadgetEdgeSentinel;
+ }
+ static inline bool isGadgetEdge(const Edge &E) {
+ return E.getValue() == GadgetEdgeSentinel;
+ }
+ int NumFences;
+ int NumGadgets;
+};
+
+class X86LoadValueInjectionLoadHardeningPass : public MachineFunctionPass {
+public:
+ X86LoadValueInjectionLoadHardeningPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "X86 Load Value Injection (LVI) Load Hardening";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+
+private:
+ using GraphBuilder = ImmutableGraphBuilder<MachineGadgetGraph>;
+ using EdgeSet = MachineGadgetGraph::EdgeSet;
+ using NodeSet = MachineGadgetGraph::NodeSet;
+ using Gadget = std::pair<MachineInstr *, MachineInstr *>;
+
+ const X86Subtarget *STI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ std::unique_ptr<MachineGadgetGraph>
+ getGadgetGraph(MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF) const;
+ int hardenLoadsWithPlugin(MachineFunction &MF,
+ std::unique_ptr<MachineGadgetGraph> Graph) const;
+ int hardenLoadsWithGreedyHeuristic(
+ MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const;
+ int elimMitigatedEdgesAndNodes(MachineGadgetGraph &G,
+ EdgeSet &ElimEdges /* in, out */,
+ NodeSet &ElimNodes /* in, out */) const;
+ std::unique_ptr<MachineGadgetGraph>
+ trimMitigatedEdges(std::unique_ptr<MachineGadgetGraph> Graph) const;
+ void findAndCutEdges(MachineGadgetGraph &G,
+ EdgeSet &CutEdges /* out */) const;
+ int insertFences(MachineFunction &MF, MachineGadgetGraph &G,
+ EdgeSet &CutEdges /* in, out */) const;
+ bool instrUsesRegToAccessMemory(const MachineInstr &I, unsigned Reg) const;
+ bool instrUsesRegToBranch(const MachineInstr &I, unsigned Reg) const;
+ inline bool isFence(const MachineInstr *MI) const {
+ return MI && (MI->getOpcode() == X86::LFENCE ||
+ (STI->useLVIControlFlowIntegrity() && MI->isCall()));
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <>
+struct GraphTraits<MachineGadgetGraph *>
+ : GraphTraits<ImmutableGraph<MachineInstr *, int> *> {};
+
+template <>
+struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits {
+ using GraphType = MachineGadgetGraph;
+ using Traits = llvm::GraphTraits<GraphType *>;
+ using NodeRef = typename Traits::NodeRef;
+ using EdgeRef = typename Traits::EdgeRef;
+ using ChildIteratorType = typename Traits::ChildIteratorType;
+ using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType;
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ std::string getNodeLabel(NodeRef Node, GraphType *) {
+ if (Node->getValue() == MachineGadgetGraph::ArgNodeSentinel)
+ return "ARGS";
+
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << *Node->getValue();
+ return OS.str();
+ }
+
+ static std::string getNodeAttributes(NodeRef Node, GraphType *) {
+ MachineInstr *MI = Node->getValue();
+ if (MI == MachineGadgetGraph::ArgNodeSentinel)
+ return "color = blue";
+ if (MI->getOpcode() == X86::LFENCE)
+ return "color = green";
+ return "";
+ }
+
+ static std::string getEdgeAttributes(NodeRef, ChildIteratorType E,
+ GraphType *) {
+ int EdgeVal = (*E.getCurrent()).getValue();
+ return EdgeVal >= 0 ? "label = " + std::to_string(EdgeVal)
+ : "color = red, style = \"dashed\"";
+ }
+};
+
+} // end namespace llvm
+
+constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel;
+constexpr int MachineGadgetGraph::GadgetEdgeSentinel;
+
+char X86LoadValueInjectionLoadHardeningPass::ID = 0;
+
+void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ AU.setPreservesCFG();
+}
+
+static void WriteGadgetGraph(raw_ostream &OS, MachineFunction &MF,
+ MachineGadgetGraph *G) {
+ WriteGraph(OS, G, /*ShortNames*/ false,
+ "Speculative gadgets for \"" + MF.getName() + "\" function");
+}
+
+bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
+ << " *****\n");
+ STI = &MF.getSubtarget<X86Subtarget>();
+ if (!STI->useLVILoadHardening())
+ return false;
+
+ // FIXME: support 32-bit
+ if (!STI->is64Bit())
+ report_fatal_error("LVI load hardening is only supported on 64-bit", false);
+
+ // Don't skip functions with the "optnone" attribute, but do participate in
+ // opt-bisect.
+ const Function &F = MF.getFunction();
+ if (!F.hasOptNone() && skipFunction(F))
+ return false;
+
+ ++NumFunctionsConsidered;
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ LLVM_DEBUG(dbgs() << "Building gadget graph...\n");
+ const auto &MLI = getAnalysis<MachineLoopInfo>();
+ const auto &MDT = getAnalysis<MachineDominatorTree>();
+ const auto &MDF = getAnalysis<MachineDominanceFrontier>();
+ std::unique_ptr<MachineGadgetGraph> Graph = getGadgetGraph(MF, MLI, MDT, MDF);
+ LLVM_DEBUG(dbgs() << "Building gadget graph... Done\n");
+ if (Graph == nullptr)
+ return false; // didn't find any gadgets
+
+ if (EmitDotVerify) {
+ WriteGadgetGraph(outs(), MF, Graph.get());
+ return false;
+ }
+
+ if (EmitDot || EmitDotOnly) {
+ LLVM_DEBUG(dbgs() << "Emitting gadget graph...\n");
+ std::error_code FileError;
+ std::string FileName = "lvi.";
+ FileName += MF.getName();
+ FileName += ".dot";
+ raw_fd_ostream FileOut(FileName, FileError);
+ if (FileError)
+ errs() << FileError.message();
+ WriteGadgetGraph(FileOut, MF, Graph.get());
+ FileOut.close();
+ LLVM_DEBUG(dbgs() << "Emitting gadget graph... Done\n");
+ if (EmitDotOnly)
+ return false;
+ }
+
+ int FencesInserted;
+ if (!OptimizePluginPath.empty()) {
+ if (!OptimizeDL.isValid()) {
+ std::string ErrorMsg;
+ OptimizeDL = llvm::sys::DynamicLibrary::getPermanentLibrary(
+ OptimizePluginPath.c_str(), &ErrorMsg);
+ if (!ErrorMsg.empty())
+ report_fatal_error("Failed to load opt plugin: \"" + ErrorMsg + '\"');
+ OptimizeCut = (OptimizeCutT)OptimizeDL.getAddressOfSymbol("optimize_cut");
+ if (!OptimizeCut)
+ report_fatal_error("Invalid optimization plugin");
+ }
+ FencesInserted = hardenLoadsWithPlugin(MF, std::move(Graph));
+ } else { // Use the default greedy heuristic
+ FencesInserted = hardenLoadsWithGreedyHeuristic(MF, std::move(Graph));
+ }
+
+ if (FencesInserted > 0)
+ ++NumFunctionsMitigated;
+ NumFences += FencesInserted;
+ return (FencesInserted > 0);
+}
+
+std::unique_ptr<MachineGadgetGraph>
+X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
+ MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF) const {
+ using namespace rdf;
+
+ // Build the Register Dataflow Graph using the RDF framework
+ TargetOperandInfo TOI{*TII};
+ DataFlowGraph DFG{MF, *TII, *TRI, MDT, MDF, TOI};
+ DFG.build();
+ Liveness L{MF.getRegInfo(), DFG};
+ L.computePhiInfo();
+
+ GraphBuilder Builder;
+ using GraphIter = typename GraphBuilder::BuilderNodeRef;
+ DenseMap<MachineInstr *, GraphIter> NodeMap;
+ int FenceCount = 0, GadgetCount = 0;
+ auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) {
+ auto Ref = NodeMap.find(MI);
+ if (Ref == NodeMap.end()) {
+ auto I = Builder.addVertex(MI);
+ NodeMap[MI] = I;
+ return std::pair<GraphIter, bool>{I, true};
+ }
+ return std::pair<GraphIter, bool>{Ref->getSecond(), false};
+ };
+
+ // The `Transmitters` map memoizes transmitters found for each def. If a def
+ // has not yet been analyzed, then it will not appear in the map. If a def
+ // has been analyzed and was determined not to have any transmitters, then
+ // its list of transmitters will be empty.
+ DenseMap<NodeId, std::vector<NodeId>> Transmitters;
+
+  // Analyze all machine instructions to find gadgets and LFENCEs, adding
+  // each interesting node to the gadget graph.
+ auto AnalyzeDef = [&](NodeAddr<DefNode *> SourceDef) {
+ SmallSet<NodeId, 8> UsesVisited, DefsVisited;
+ std::function<void(NodeAddr<DefNode *>)> AnalyzeDefUseChain =
+ [&](NodeAddr<DefNode *> Def) {
+ if (Transmitters.find(Def.Id) != Transmitters.end())
+ return; // Already analyzed `Def`
+
+ // Use RDF to find all the uses of `Def`
+ rdf::NodeSet Uses;
+ RegisterRef DefReg = DFG.getPRI().normalize(Def.Addr->getRegRef(DFG));
+ for (auto UseID : L.getAllReachedUses(DefReg, Def)) {
+ auto Use = DFG.addr<UseNode *>(UseID);
+ if (Use.Addr->getFlags() & NodeAttrs::PhiRef) { // phi node
+ NodeAddr<PhiNode *> Phi = Use.Addr->getOwner(DFG);
+ for (auto I : L.getRealUses(Phi.Id)) {
+ if (DFG.getPRI().alias(RegisterRef(I.first), DefReg)) {
+ for (auto UA : I.second)
+ Uses.emplace(UA.first);
+ }
+ }
+ } else { // not a phi node
+ Uses.emplace(UseID);
+ }
+ }
+
+ // For each use of `Def`, we want to know whether:
+ // (1) The use can leak the Def'ed value,
+ // (2) The use can further propagate the Def'ed value to more defs
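+          // For example, a use as the base or index register of a memory
+          // access can leak the value, whereas a use in ordinary arithmetic
+          // merely propagates it to the instruction's defs.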
+ for (auto UseID : Uses) {
+ if (!UsesVisited.insert(UseID).second)
+ continue; // Already visited this use of `Def`
+
+ auto Use = DFG.addr<UseNode *>(UseID);
+ assert(!(Use.Addr->getFlags() & NodeAttrs::PhiRef));
+ MachineOperand &UseMO = Use.Addr->getOp();
+ MachineInstr &UseMI = *UseMO.getParent();
+ assert(UseMO.isReg());
+
+ // We naively assume that an instruction propagates any loaded
+ // uses to all defs unless the instruction is a call, in which
+ // case all arguments will be treated as gadget sources during
+ // analysis of the callee function.
+ if (UseMI.isCall())
+ continue;
+
+ // Check whether this use can transmit (leak) its value.
+ if (instrUsesRegToAccessMemory(UseMI, UseMO.getReg()) ||
+ (!NoConditionalBranches &&
+ instrUsesRegToBranch(UseMI, UseMO.getReg()))) {
+ Transmitters[Def.Id].push_back(Use.Addr->getOwner(DFG).Id);
+ if (UseMI.mayLoad())
+ continue; // Found a transmitting load -- no need to continue
+ // traversing its defs (i.e., this load will become
+                        // a new gadget source anyway).
+ }
+
+ // Check whether the use propagates to more defs.
+ NodeAddr<InstrNode *> Owner{Use.Addr->getOwner(DFG)};
+ rdf::NodeList AnalyzedChildDefs;
+ for (auto &ChildDef :
+ Owner.Addr->members_if(DataFlowGraph::IsDef, DFG)) {
+ if (!DefsVisited.insert(ChildDef.Id).second)
+ continue; // Already visited this def
+ if (Def.Addr->getAttrs() & NodeAttrs::Dead)
+ continue;
+ if (Def.Id == ChildDef.Id)
+ continue; // `Def` uses itself (e.g., increment loop counter)
+
+ AnalyzeDefUseChain(ChildDef);
+
+ // `Def` inherits all of its child defs' transmitters.
+ for (auto TransmitterId : Transmitters[ChildDef.Id])
+ Transmitters[Def.Id].push_back(TransmitterId);
+ }
+ }
+
+ // Note that this statement adds `Def.Id` to the map if no
+ // transmitters were found for `Def`.
+ auto &DefTransmitters = Transmitters[Def.Id];
+
+ // Remove duplicate transmitters
+ llvm::sort(DefTransmitters);
+ DefTransmitters.erase(
+ std::unique(DefTransmitters.begin(), DefTransmitters.end()),
+ DefTransmitters.end());
+ };
+
+ // Find all of the transmitters
+ AnalyzeDefUseChain(SourceDef);
+ auto &SourceDefTransmitters = Transmitters[SourceDef.Id];
+ if (SourceDefTransmitters.empty())
+ return; // No transmitters for `SourceDef`
+
+ MachineInstr *Source = SourceDef.Addr->getFlags() & NodeAttrs::PhiRef
+ ? MachineGadgetGraph::ArgNodeSentinel
+ : SourceDef.Addr->getOp().getParent();
+ auto GadgetSource = MaybeAddNode(Source);
+ // Each transmitter is a sink for `SourceDef`.
+ for (auto TransmitterId : SourceDefTransmitters) {
+ MachineInstr *Sink = DFG.addr<StmtNode *>(TransmitterId).Addr->getCode();
+ auto GadgetSink = MaybeAddNode(Sink);
+ // Add the gadget edge to the graph.
+ Builder.addEdge(MachineGadgetGraph::GadgetEdgeSentinel,
+ GadgetSource.first, GadgetSink.first);
+ ++GadgetCount;
+ }
+ };
+
+ LLVM_DEBUG(dbgs() << "Analyzing def-use chains to find gadgets\n");
+ // Analyze function arguments
+ NodeAddr<BlockNode *> EntryBlock = DFG.getFunc().Addr->getEntryBlock(DFG);
+ for (NodeAddr<PhiNode *> ArgPhi :
+ EntryBlock.Addr->members_if(DataFlowGraph::IsPhi, DFG)) {
+ NodeList Defs = ArgPhi.Addr->members_if(DataFlowGraph::IsDef, DFG);
+ llvm::for_each(Defs, AnalyzeDef);
+ }
+ // Analyze every instruction in MF
+ for (NodeAddr<BlockNode *> BA : DFG.getFunc().Addr->members(DFG)) {
+ for (NodeAddr<StmtNode *> SA :
+ BA.Addr->members_if(DataFlowGraph::IsCode<NodeAttrs::Stmt>, DFG)) {
+ MachineInstr *MI = SA.Addr->getCode();
+ if (isFence(MI)) {
+ MaybeAddNode(MI);
+ ++FenceCount;
+ } else if (MI->mayLoad()) {
+ NodeList Defs = SA.Addr->members_if(DataFlowGraph::IsDef, DFG);
+ llvm::for_each(Defs, AnalyzeDef);
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Found " << FenceCount << " fences\n");
+ LLVM_DEBUG(dbgs() << "Found " << GadgetCount << " gadgets\n");
+ if (GadgetCount == 0)
+ return nullptr;
+ NumGadgets += GadgetCount;
+
+ // Traverse CFG to build the rest of the graph
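+  // CFG edges are weighted by loop depth, so the cut heuristics below can
+  // prefer to fence edges that sit outside of deeply nested (likely hot)
+  // loops.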
+ SmallSet<MachineBasicBlock *, 8> BlocksVisited;
+ std::function<void(MachineBasicBlock *, GraphIter, unsigned)> TraverseCFG =
+ [&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) {
+ unsigned LoopDepth = MLI.getLoopDepth(MBB);
+ if (!MBB->empty()) {
+ // Always add the first instruction in each block
+ auto NI = MBB->begin();
+ auto BeginBB = MaybeAddNode(&*NI);
+ Builder.addEdge(ParentDepth, GI, BeginBB.first);
+ if (!BlocksVisited.insert(MBB).second)
+ return;
+
+ // Add any instructions within the block that are gadget components
+ GI = BeginBB.first;
+ while (++NI != MBB->end()) {
+ auto Ref = NodeMap.find(&*NI);
+ if (Ref != NodeMap.end()) {
+ Builder.addEdge(LoopDepth, GI, Ref->getSecond());
+ GI = Ref->getSecond();
+ }
+ }
+
+ // Always add the terminator instruction, if one exists
+ auto T = MBB->getFirstTerminator();
+ if (T != MBB->end()) {
+ auto EndBB = MaybeAddNode(&*T);
+ if (EndBB.second)
+ Builder.addEdge(LoopDepth, GI, EndBB.first);
+ GI = EndBB.first;
+ }
+ }
+ for (MachineBasicBlock *Succ : MBB->successors())
+ TraverseCFG(Succ, GI, LoopDepth);
+ };
+ // ArgNodeSentinel is a pseudo-instruction that represents MF args in the
+ // GadgetGraph
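+  // Gadget sources that originate from function arguments (modeled by RDF as
+  // entry-block phi defs) are all rooted at this single sentinel node.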
+ GraphIter ArgNode = MaybeAddNode(MachineGadgetGraph::ArgNodeSentinel).first;
+ TraverseCFG(&MF.front(), ArgNode, 0);
+ std::unique_ptr<MachineGadgetGraph> G{Builder.get(FenceCount, GadgetCount)};
+ LLVM_DEBUG(dbgs() << "Found " << G->nodes_size() << " nodes\n");
+ return G;
+}
+
+// Returns the number of remaining gadget edges that could not be eliminated
+int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
+ MachineGadgetGraph &G, MachineGadgetGraph::EdgeSet &ElimEdges /* in, out */,
+ MachineGadgetGraph::NodeSet &ElimNodes /* in, out */) const {
+ if (G.NumFences > 0) {
+    // Eliminate fences, along with the CFG edges that enter and leave each
+    // fence, as both are trivially mitigated.
+ for (const auto &E : G.edges()) {
+ const MachineGadgetGraph::Node *Dest = E.getDest();
+ if (isFence(Dest->getValue())) {
+ ElimNodes.insert(*Dest);
+ ElimEdges.insert(E);
+ for (const auto &DE : Dest->edges())
+ ElimEdges.insert(DE);
+ }
+ }
+ }
+
+ // Find and eliminate gadget edges that have been mitigated.
+ int MitigatedGadgets = 0, RemainingGadgets = 0;
+ MachineGadgetGraph::NodeSet ReachableNodes{G};
+ for (const auto &RootN : G.nodes()) {
+ if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge))
+ continue; // skip this node if it isn't a gadget source
+
+ // Find all of the nodes that are CFG-reachable from RootN using DFS
+ ReachableNodes.clear();
+ std::function<void(const MachineGadgetGraph::Node *, bool)>
+ FindReachableNodes =
+ [&](const MachineGadgetGraph::Node *N, bool FirstNode) {
+ if (!FirstNode)
+ ReachableNodes.insert(*N);
+ for (const auto &E : N->edges()) {
+ const MachineGadgetGraph::Node *Dest = E.getDest();
+ if (MachineGadgetGraph::isCFGEdge(E) &&
+ !ElimEdges.contains(E) && !ReachableNodes.contains(*Dest))
+ FindReachableNodes(Dest, false);
+ }
+ };
+ FindReachableNodes(&RootN, true);
+
+ // Any gadget whose sink is unreachable has been mitigated
+ for (const auto &E : RootN.edges()) {
+ if (MachineGadgetGraph::isGadgetEdge(E)) {
+ if (ReachableNodes.contains(*E.getDest())) {
+ // This gadget's sink is reachable
+ ++RemainingGadgets;
+ } else { // This gadget's sink is unreachable, and therefore mitigated
+ ++MitigatedGadgets;
+ ElimEdges.insert(E);
+ }
+ }
+ }
+ }
+ return RemainingGadgets;
+}
+
+std::unique_ptr<MachineGadgetGraph>
+X86LoadValueInjectionLoadHardeningPass::trimMitigatedEdges(
+ std::unique_ptr<MachineGadgetGraph> Graph) const {
+ MachineGadgetGraph::NodeSet ElimNodes{*Graph};
+ MachineGadgetGraph::EdgeSet ElimEdges{*Graph};
+ int RemainingGadgets =
+ elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes);
+ if (ElimEdges.empty() && ElimNodes.empty()) {
+ Graph->NumFences = 0;
+ Graph->NumGadgets = RemainingGadgets;
+ } else {
+ Graph = GraphBuilder::trim(*Graph, ElimNodes, ElimEdges, 0 /* NumFences */,
+ RemainingGadgets);
+ }
+ return Graph;
+}
+
+int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin(
+ MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const {
+ int FencesInserted = 0;
+
+ do {
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
+ Graph = trimMitigatedEdges(std::move(Graph));
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
+ if (Graph->NumGadgets == 0)
+ break;
+
+ LLVM_DEBUG(dbgs() << "Cutting edges...\n");
+ EdgeSet CutEdges{*Graph};
+ auto Nodes = std::make_unique<unsigned int[]>(Graph->nodes_size() +
+ 1 /* terminator node */);
+ auto Edges = std::make_unique<unsigned int[]>(Graph->edges_size());
+ auto EdgeCuts = std::make_unique<int[]>(Graph->edges_size());
+ auto EdgeValues = std::make_unique<int[]>(Graph->edges_size());
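+    // The plugin consumes the graph in compressed-sparse-row (CSR) form:
+    // Nodes[I] is the index of the first edge leaving node I (with a
+    // terminator entry at Nodes[nodes_size()]), Edges[J] is the destination
+    // node of edge J, and EdgeValues[J] carries edge J's weight.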
+ for (const auto &N : Graph->nodes()) {
+ Nodes[Graph->getNodeIndex(N)] = Graph->getEdgeIndex(*N.edges_begin());
+ }
+ Nodes[Graph->nodes_size()] = Graph->edges_size(); // terminator node
+ for (const auto &E : Graph->edges()) {
+ Edges[Graph->getEdgeIndex(E)] = Graph->getNodeIndex(*E.getDest());
+ EdgeValues[Graph->getEdgeIndex(E)] = E.getValue();
+ }
+ OptimizeCut(Nodes.get(), Graph->nodes_size(), Edges.get(), EdgeValues.get(),
+ EdgeCuts.get(), Graph->edges_size());
+ for (int I = 0; I < Graph->edges_size(); ++I)
+ if (EdgeCuts[I])
+ CutEdges.set(I);
+ LLVM_DEBUG(dbgs() << "Cutting edges... Done\n");
+ LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n");
+
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n");
+ FencesInserted += insertFences(MF, *Graph, CutEdges);
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n");
+ LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n");
+
+ Graph = GraphBuilder::trim(*Graph, MachineGadgetGraph::NodeSet{*Graph},
+ CutEdges);
+ } while (true);
+
+ return FencesInserted;
+}
+
+int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithGreedyHeuristic(
+ MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const {
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
+ Graph = trimMitigatedEdges(std::move(Graph));
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
+ if (Graph->NumGadgets == 0)
+ return 0;
+
+ LLVM_DEBUG(dbgs() << "Cutting edges...\n");
+ MachineGadgetGraph::NodeSet ElimNodes{*Graph}, GadgetSinks{*Graph};
+ MachineGadgetGraph::EdgeSet ElimEdges{*Graph}, CutEdges{*Graph};
+ auto IsCFGEdge = [&ElimEdges, &CutEdges](const MachineGadgetGraph::Edge &E) {
+ return !ElimEdges.contains(E) && !CutEdges.contains(E) &&
+ MachineGadgetGraph::isCFGEdge(E);
+ };
+ auto IsGadgetEdge = [&ElimEdges,
+ &CutEdges](const MachineGadgetGraph::Edge &E) {
+ return !ElimEdges.contains(E) && !CutEdges.contains(E) &&
+ MachineGadgetGraph::isGadgetEdge(E);
+ };
+
+  // FIXME: this is O(E^2); we could probably do better.
+ do {
+ // Find the cheapest CFG edge that will eliminate a gadget (by being
+ // egress from a SOURCE node or ingress to a SINK node), and cut it.
+ const MachineGadgetGraph::Edge *CheapestSoFar = nullptr;
+
+ // First, collect all gadget source and sink nodes.
+ MachineGadgetGraph::NodeSet GadgetSources{*Graph}, GadgetSinks{*Graph};
+ for (const auto &N : Graph->nodes()) {
+ if (ElimNodes.contains(N))
+ continue;
+ for (const auto &E : N.edges()) {
+ if (IsGadgetEdge(E)) {
+ GadgetSources.insert(N);
+ GadgetSinks.insert(*E.getDest());
+ }
+ }
+ }
+
+ // Next, look for the cheapest CFG edge which, when cut, is guaranteed to
+ // mitigate at least one gadget by either:
+ // (a) being egress from a gadget source, or
+ // (b) being ingress to a gadget sink.
+ for (const auto &N : Graph->nodes()) {
+ if (ElimNodes.contains(N))
+ continue;
+ for (const auto &E : N.edges()) {
+ if (IsCFGEdge(E)) {
+ if (GadgetSources.contains(N) || GadgetSinks.contains(*E.getDest())) {
+ if (!CheapestSoFar || E.getValue() < CheapestSoFar->getValue())
+ CheapestSoFar = &E;
+ }
+ }
+ }
+ }
+
+ assert(CheapestSoFar && "Failed to cut an edge");
+ CutEdges.insert(*CheapestSoFar);
+ ElimEdges.insert(*CheapestSoFar);
+ } while (elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes));
+ LLVM_DEBUG(dbgs() << "Cutting edges... Done\n");
+ LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n");
+
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n");
+ int FencesInserted = insertFences(MF, *Graph, CutEdges);
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n");
+ LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n");
+
+ return FencesInserted;
+}
+
+int X86LoadValueInjectionLoadHardeningPass::insertFences(
+ MachineFunction &MF, MachineGadgetGraph &G,
+ EdgeSet &CutEdges /* in, out */) const {
+ int FencesInserted = 0;
+ for (const auto &N : G.nodes()) {
+ for (const auto &E : N.edges()) {
+ if (CutEdges.contains(E)) {
+ MachineInstr *MI = N.getValue(), *Prev;
+ MachineBasicBlock *MBB; // Insert an LFENCE in this MBB
+ MachineBasicBlock::iterator InsertionPt; // ...at this point
+ if (MI == MachineGadgetGraph::ArgNodeSentinel) {
+ // insert LFENCE at beginning of entry block
+ MBB = &MF.front();
+ InsertionPt = MBB->begin();
+ Prev = nullptr;
+ } else if (MI->isBranch()) { // insert the LFENCE before the branch
+ MBB = MI->getParent();
+ InsertionPt = MI;
+ Prev = MI->getPrevNode();
+ // Remove all egress CFG edges from this branch because the inserted
+ // LFENCE prevents gadgets from crossing the branch.
+ for (const auto &E : N.edges()) {
+ if (MachineGadgetGraph::isCFGEdge(E))
+ CutEdges.insert(E);
+ }
+ } else { // insert the LFENCE after the instruction
+ MBB = MI->getParent();
+ InsertionPt = MI->getNextNode() ? MI->getNextNode() : MBB->end();
+ Prev = InsertionPt == MBB->end()
+ ? (MBB->empty() ? nullptr : &MBB->back())
+ : InsertionPt->getPrevNode();
+ }
+ // Ensure this insertion is not redundant (two LFENCEs in sequence).
+ if ((InsertionPt == MBB->end() || !isFence(&*InsertionPt)) &&
+ (!Prev || !isFence(Prev))) {
+ BuildMI(*MBB, InsertionPt, DebugLoc(), TII->get(X86::LFENCE));
+ ++FencesInserted;
+ }
+ }
+ }
+ }
+ return FencesInserted;
+}
+
+bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToAccessMemory(
+ const MachineInstr &MI, unsigned Reg) const {
+ if (!MI.mayLoadOrStore() || MI.getOpcode() == X86::MFENCE ||
+ MI.getOpcode() == X86::SFENCE || MI.getOpcode() == X86::LFENCE)
+ return false;
+
+  // FIXME: This does not handle pseudo loading instructions like TCRETURN*
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemRefBeginIdx < 0) {
+ LLVM_DEBUG(dbgs() << "Warning: unable to obtain memory operand for loading "
+ "instruction:\n";
+ MI.print(dbgs()); dbgs() << '\n';);
+ return false;
+ }
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ const MachineOperand &BaseMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ const MachineOperand &IndexMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+ return (BaseMO.isReg() && BaseMO.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(BaseMO.getReg(), Reg)) ||
+ (IndexMO.isReg() && IndexMO.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(IndexMO.getReg(), Reg));
+}
+
+bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToBranch(
+ const MachineInstr &MI, unsigned Reg) const {
+ if (!MI.isConditionalBranch())
+ return false;
+ for (const MachineOperand &Use : MI.uses())
+ if (Use.isReg() && Use.getReg() == Reg)
+ return true;
+ return false;
+}
+
+INITIALIZE_PASS_BEGIN(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
+ "X86 LVI load hardening", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
+ "X86 LVI load hardening", false, false)
+
+FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningPass() {
+ return new X86LoadValueInjectionLoadHardeningPass();
+}
+
+namespace {
+
+/// The `X86LoadValueInjectionLoadHardeningPass` above depends on expensive
+/// analysis passes that add complexity to the pipeline. This complexity
+/// can cause noticeable overhead when no optimizations are enabled, i.e., -O0.
+/// The purpose of `X86LoadValueInjectionLoadHardeningUnoptimizedPass` is to
+/// provide the same security as the optimized pass, but without adding
+/// unnecessary complexity to the LLVM pipeline.
+///
+/// The behavior of this pass is simply to insert an LFENCE after every load
+/// instruction.
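+///
+/// For example, a load such as `movq (%rdi), %rax` is rewritten to:
+/// ```
+/// movq (%rdi), %rax
+/// lfence
+/// ```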
+class X86LoadValueInjectionLoadHardeningUnoptimizedPass
+ : public MachineFunctionPass {
+public:
+ X86LoadValueInjectionLoadHardeningUnoptimizedPass()
+ : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "X86 Load Value Injection (LVI) Load Hardening (Unoptimized)";
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86LoadValueInjectionLoadHardeningUnoptimizedPass::ID = 0;
+
+bool X86LoadValueInjectionLoadHardeningUnoptimizedPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
+ << " *****\n");
+ const X86Subtarget *STI = &MF.getSubtarget<X86Subtarget>();
+ if (!STI->useLVILoadHardening())
+ return false;
+
+ // FIXME: support 32-bit
+ if (!STI->is64Bit())
+ report_fatal_error("LVI load hardening is only supported on 64-bit", false);
+
+  // Don't skip functions with the "optnone" attribute, but still honor
+  // opt-bisect.
+ const Function &F = MF.getFunction();
+ if (!F.hasOptNone() && skipFunction(F))
+ return false;
+
+ bool Modified = false;
+ ++NumFunctionsConsidered;
+
+ const TargetInstrInfo *TII = STI->getInstrInfo();
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (!MI.mayLoad() || MI.getOpcode() == X86::LFENCE ||
+ MI.getOpcode() == X86::MFENCE)
+ continue;
+
+ MachineBasicBlock::iterator InsertionPt =
+ MI.getNextNode() ? MI.getNextNode() : MBB.end();
+ BuildMI(MBB, InsertionPt, DebugLoc(), TII->get(X86::LFENCE));
+ ++NumFences;
+ Modified = true;
+ }
+ }
+
+ if (Modified)
+ ++NumFunctionsMitigated;
+
+ return Modified;
+}
+
+INITIALIZE_PASS(X86LoadValueInjectionLoadHardeningUnoptimizedPass, PASS_KEY,
+ "X86 LVI load hardening", false, false)
+
+FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningUnoptimizedPass() {
+ return new X86LoadValueInjectionLoadHardeningUnoptimizedPass();
+}
Property changes on: head/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp (nonexistent)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp (revision 362609)
@@ -0,0 +1,143 @@
+//===-- X86LoadValueInjectionRetHardening.cpp - LVI RET hardening for x86 --==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Description: Replaces every `ret` instruction with the sequence:
+/// ```
+/// pop <scratch-reg>
+/// lfence
+/// jmp *<scratch-reg>
+/// ```
+/// where `<scratch-reg>` is some available scratch register, according to the
+/// calling convention of the function being mitigated.
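+///
+/// For example, assuming RCX is available as a scratch register, a bare `retq`
+/// becomes:
+/// ```
+/// popq %rcx
+/// lfence
+/// jmpq *%rcx
+/// ```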
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include <bitset>
+
+using namespace llvm;
+
+#define PASS_KEY "x86-lvi-ret"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation");
+STATISTIC(NumFunctionsConsidered, "Number of functions analyzed");
+STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations "
+ "were deployed");
+
+namespace {
+
+class X86LoadValueInjectionRetHardeningPass : public MachineFunctionPass {
+public:
+ X86LoadValueInjectionRetHardeningPass() : MachineFunctionPass(ID) {}
+ StringRef getPassName() const override {
+ return "X86 Load Value Injection (LVI) Ret-Hardening";
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86LoadValueInjectionRetHardeningPass::ID = 0;
+
+bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
+ << " *****\n");
+ const X86Subtarget *Subtarget = &MF.getSubtarget<X86Subtarget>();
+ if (!Subtarget->useLVIControlFlowIntegrity() || !Subtarget->is64Bit())
+ return false; // FIXME: support 32-bit
+
+  // Don't skip functions with the "optnone" attribute, but still honor
+  // opt-bisect.
+ const Function &F = MF.getFunction();
+ if (!F.hasOptNone() && skipFunction(F))
+ return false;
+
+ ++NumFunctionsConsidered;
+ const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned ClobberReg = X86::NoRegister;
+ std::bitset<X86::NUM_TARGET_REGS> UnclobberableGR64s;
+ UnclobberableGR64s.set(X86::RSP); // can't clobber stack pointer
+ UnclobberableGR64s.set(X86::RIP); // can't clobber instruction pointer
+ UnclobberableGR64s.set(X86::RAX); // used for function return
+ UnclobberableGR64s.set(X86::RDX); // used for function return
+
+ // We can clobber any register allowed by the function's calling convention.
+ for (const MCPhysReg *PR = TRI->getCalleeSavedRegs(&MF); auto Reg = *PR; ++PR)
+ UnclobberableGR64s.set(Reg);
+ for (auto &Reg : X86::GR64RegClass) {
+ if (!UnclobberableGR64s.test(Reg)) {
+ ClobberReg = Reg;
+ break;
+ }
+ }
+
+ if (ClobberReg != X86::NoRegister) {
+ LLVM_DEBUG(dbgs() << "Selected register "
+ << Subtarget->getRegisterInfo()->getRegAsmName(ClobberReg)
+ << " to clobber\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "Could not find a register to clobber\n");
+ }
+
+ bool Modified = false;
+ for (auto &MBB : MF) {
+ if (MBB.empty())
+ continue;
+
+ MachineInstr &MI = MBB.back();
+ if (MI.getOpcode() != X86::RETQ)
+ continue;
+
+ if (ClobberReg != X86::NoRegister) {
+ MBB.erase_instr(&MI);
+ BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::POP64r))
+ .addReg(ClobberReg, RegState::Define)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::JMP64r))
+ .addReg(ClobberReg);
+ } else {
+      // In case there is no available scratch register, we can still read
+      // the stack slot addressed by RSP to assert that RSP points to a valid
+      // page. The accompanying write back to that slot is also helpful
+      // because it verifies that the stack's write permissions are intact.
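+      // The resulting sequence is: shlq $0, (%rsp); lfence; retq. The
+      // shift-by-zero leaves the return address unchanged.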
+ MachineInstr *Fence = BuildMI(MBB, MI, DebugLoc(), TII->get(X86::LFENCE));
+ addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)),
+ X86::RSP, false, 0)
+ .addImm(0)
+ ->addRegisterDead(X86::EFLAGS, TRI);
+ }
+
+ ++NumFences;
+ Modified = true;
+ }
+
+ if (Modified)
+ ++NumFunctionsMitigated;
+ return Modified;
+}
+
+INITIALIZE_PASS(X86LoadValueInjectionRetHardeningPass, PASS_KEY,
+ "X86 LVI ret hardener", false, false)
+
+FunctionPass *llvm::createX86LoadValueInjectionRetHardeningPass() {
+ return new X86LoadValueInjectionRetHardeningPass();
+}
Property changes on: head/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp (revision 362609)
@@ -1,2660 +1,2660 @@
//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains code to lower X86 MachineInstrs to their corresponding
// MCInst records.
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86InstComments.h"
#include "MCTargetDesc/X86TargetStreamer.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86AsmPrinter.h"
#include "X86RegisterInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
namespace {
/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
class X86MCInstLower {
MCContext &Ctx;
const MachineFunction &MF;
const TargetMachine &TM;
const MCAsmInfo &MAI;
X86AsmPrinter &AsmPrinter;
public:
X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI,
const MachineOperand &MO) const;
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
private:
MachineModuleInfoMachO &getMachOMMI() const;
};
} // end anonymous namespace
// Emit a minimal sequence of nops spanning NumBytes bytes.
static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI);
void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
const MCSubtargetInfo &STI,
MCCodeEmitter *CodeEmitter) {
if (InShadow) {
SmallString<256> Code;
SmallVector<MCFixup, 4> Fixups;
raw_svector_ostream VecOS(Code);
CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI);
CurrentShadowSize += Code.size();
if (CurrentShadowSize >= RequiredShadowSize)
InShadow = false; // The shadow is big enough. Stop counting.
}
}
void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
if (InShadow && CurrentShadowSize < RequiredShadowSize) {
InShadow = false;
EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
}
}
void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
}
X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
X86AsmPrinter &asmprinter)
: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()),
AsmPrinter(asmprinter) {}
MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
}
/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
/// operand to an MCSymbol.
MCSymbol *X86MCInstLower::GetSymbolFromOperand(const MachineOperand &MO) const {
const DataLayout &DL = MF.getDataLayout();
assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) &&
"Isn't a symbol reference");
MCSymbol *Sym = nullptr;
SmallString<128> Name;
StringRef Suffix;
switch (MO.getTargetFlags()) {
case X86II::MO_DLLIMPORT:
// Handle dllimport linkage.
Name += "__imp_";
break;
case X86II::MO_COFFSTUB:
Name += ".refptr.";
break;
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
Suffix = "$non_lazy_ptr";
break;
}
if (!Suffix.empty())
Name += DL.getPrivateGlobalPrefix();
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
AsmPrinter.getNameWithPrefix(Name, GV);
} else if (MO.isSymbol()) {
Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
} else if (MO.isMBB()) {
assert(Suffix.empty());
Sym = MO.getMBB()->getSymbol();
}
Name += Suffix;
if (!Sym)
Sym = Ctx.getOrCreateSymbol(Name);
// If the target flags on the operand change the name of the symbol, do that
// before we return the symbol.
switch (MO.getTargetFlags()) {
default:
break;
case X86II::MO_COFFSTUB: {
MachineModuleInfoCOFF &MMICOFF =
MF.getMMI().getObjFileInfo<MachineModuleInfoCOFF>();
MachineModuleInfoImpl::StubValueTy &StubSym = MMICOFF.getGVStubEntry(Sym);
if (!StubSym.getPointer()) {
assert(MO.isGlobal() && "Extern symbol not handled yet");
StubSym = MachineModuleInfoImpl::StubValueTy(
AsmPrinter.getSymbol(MO.getGlobal()), true);
}
break;
}
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
MachineModuleInfoImpl::StubValueTy &StubSym =
getMachOMMI().getGVStubEntry(Sym);
if (!StubSym.getPointer()) {
assert(MO.isGlobal() && "Extern symbol not handled yet");
StubSym = MachineModuleInfoImpl::StubValueTy(
AsmPrinter.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
}
break;
}
}
return Sym;
}
MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MCSymbol *Sym) const {
// FIXME: We would like an efficient form for this, so we don't have to do a
// lot of extra uniquing.
const MCExpr *Expr = nullptr;
MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
switch (MO.getTargetFlags()) {
default:
llvm_unreachable("Unknown target flag on GV operand");
case X86II::MO_NO_FLAG: // No flag.
// These affect the name of the symbol, not any suffix.
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DLLIMPORT:
case X86II::MO_COFFSTUB:
break;
case X86II::MO_TLVP:
RefKind = MCSymbolRefExpr::VK_TLVP;
break;
case X86II::MO_TLVP_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
// Subtract the pic base.
Expr = MCBinaryExpr::createSub(
Expr, MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx), Ctx);
break;
case X86II::MO_SECREL:
RefKind = MCSymbolRefExpr::VK_SECREL;
break;
case X86II::MO_TLSGD:
RefKind = MCSymbolRefExpr::VK_TLSGD;
break;
case X86II::MO_TLSLD:
RefKind = MCSymbolRefExpr::VK_TLSLD;
break;
case X86II::MO_TLSLDM:
RefKind = MCSymbolRefExpr::VK_TLSLDM;
break;
case X86II::MO_GOTTPOFF:
RefKind = MCSymbolRefExpr::VK_GOTTPOFF;
break;
case X86II::MO_INDNTPOFF:
RefKind = MCSymbolRefExpr::VK_INDNTPOFF;
break;
case X86II::MO_TPOFF:
RefKind = MCSymbolRefExpr::VK_TPOFF;
break;
case X86II::MO_DTPOFF:
RefKind = MCSymbolRefExpr::VK_DTPOFF;
break;
case X86II::MO_NTPOFF:
RefKind = MCSymbolRefExpr::VK_NTPOFF;
break;
case X86II::MO_GOTNTPOFF:
RefKind = MCSymbolRefExpr::VK_GOTNTPOFF;
break;
case X86II::MO_GOTPCREL:
RefKind = MCSymbolRefExpr::VK_GOTPCREL;
break;
case X86II::MO_GOT:
RefKind = MCSymbolRefExpr::VK_GOT;
break;
case X86II::MO_GOTOFF:
RefKind = MCSymbolRefExpr::VK_GOTOFF;
break;
case X86II::MO_PLT:
RefKind = MCSymbolRefExpr::VK_PLT;
break;
case X86II::MO_ABS8:
RefKind = MCSymbolRefExpr::VK_X86_ABS8;
break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, Ctx);
// Subtract the pic base.
Expr = MCBinaryExpr::createSub(
Expr, MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx), Ctx);
if (MO.isJTI()) {
assert(MAI.doesSetDirectiveSuppressReloc());
// If .set directive is supported, use it to reduce the number of
// relocations the assembler will generate for differences between
// local labels. This is only safe when the symbols are in the same
// section so we are restricting it to jumptable references.
MCSymbol *Label = Ctx.createTempSymbol();
AsmPrinter.OutStreamer->EmitAssignment(Label, Expr);
Expr = MCSymbolRefExpr::create(Label, Ctx);
}
break;
}
if (!Expr)
Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
Expr = MCBinaryExpr::createAdd(
Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
return MCOperand::createExpr(Expr);
}
/// Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instructions with
/// a short fixed-register form.
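/// For example, "addl $42, %eax" (ADD32ri) can be encoded with the shorter
/// ADD32i32 form, which implicitly targets EAX.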
static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
unsigned ImmOp = Inst.getNumOperands() - 1;
assert(Inst.getOperand(0).isReg() &&
(Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
Inst.getNumOperands() == 2) &&
"Unexpected instruction!");
// Check whether the destination register can be fixed.
unsigned Reg = Inst.getOperand(0).getReg();
if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
return;
// If so, rewrite the instruction.
MCOperand Saved = Inst.getOperand(ImmOp);
Inst = MCInst();
Inst.setOpcode(Opcode);
Inst.addOperand(Saved);
}
/// If a movsx instruction has a shorter encoding for the used register,
/// simplify the instruction to use it instead.
static void SimplifyMOVSX(MCInst &Inst) {
unsigned NewOpcode = 0;
unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg();
switch (Inst.getOpcode()) {
default:
llvm_unreachable("Unexpected instruction!");
case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
if (Op0 == X86::AX && Op1 == X86::AL)
NewOpcode = X86::CBW;
break;
case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl
if (Op0 == X86::EAX && Op1 == X86::AX)
NewOpcode = X86::CWDE;
break;
case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
if (Op0 == X86::RAX && Op1 == X86::EAX)
NewOpcode = X86::CDQE;
break;
}
if (NewOpcode != 0) {
Inst = MCInst();
Inst.setOpcode(NewOpcode);
}
}
/// Simplify things like MOV32rm to MOV32o32a.
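/// For example, in 32-bit mode a MOV32mr store of EAX to an absolute address
/// can use the short MOV32o32a (moffs) encoding.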
static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
unsigned Opcode) {
// Don't make these simplifications in 64-bit mode; other assemblers don't
// perform them because they make the code larger.
if (Printer.getSubtarget().is64Bit())
return;
bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
unsigned AddrBase = IsStore;
unsigned RegOp = IsStore ? 0 : 5;
unsigned AddrOp = AddrBase + 3;
assert(
Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
(Inst.getOperand(AddrOp).isExpr() || Inst.getOperand(AddrOp).isImm()) &&
"Unexpected instruction!");
// Check whether the destination register can be fixed.
unsigned Reg = Inst.getOperand(RegOp).getReg();
if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
return;
// Check whether this is an absolute address.
// FIXME: We know TLVP symbol refs aren't, but there should be a better way
// to do this here.
bool Absolute = true;
if (Inst.getOperand(AddrOp).isExpr()) {
const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
Absolute = false;
}
if (Absolute &&
(Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 ||
Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 ||
Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0))
return;
// If so, rewrite the instruction.
MCOperand Saved = Inst.getOperand(AddrOp);
MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg);
Inst = MCInst();
Inst.setOpcode(Opcode);
Inst.addOperand(Saved);
Inst.addOperand(Seg);
}
static unsigned getRetOpcode(const X86Subtarget &Subtarget) {
return Subtarget.is64Bit() ? X86::RETQ : X86::RETL;
}
Optional<MCOperand>
X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
const MachineOperand &MO) const {
switch (MO.getType()) {
default:
MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
if (MO.isImplicit())
return None;
return MCOperand::createReg(MO.getReg());
case MachineOperand::MO_Immediate:
return MCOperand::createImm(MO.getImm());
case MachineOperand::MO_MachineBasicBlock:
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ExternalSymbol:
return LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
case MachineOperand::MO_MCSymbol:
return LowerSymbolOperand(MO, MO.getMCSymbol());
case MachineOperand::MO_JumpTableIndex:
return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
case MachineOperand::MO_ConstantPoolIndex:
return LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
case MachineOperand::MO_BlockAddress:
return LowerSymbolOperand(
MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
case MachineOperand::MO_RegisterMask:
// Ignore call clobbers.
return None;
}
}
// Replace TAILJMP opcodes with their equivalent opcodes that have encoding
// information.
static unsigned convertTailJumpOpcode(unsigned Opcode) {
switch (Opcode) {
case X86::TAILJMPr:
Opcode = X86::JMP32r;
break;
case X86::TAILJMPm:
Opcode = X86::JMP32m;
break;
case X86::TAILJMPr64:
Opcode = X86::JMP64r;
break;
case X86::TAILJMPm64:
Opcode = X86::JMP64m;
break;
case X86::TAILJMPr64_REX:
Opcode = X86::JMP64r_REX;
break;
case X86::TAILJMPm64_REX:
Opcode = X86::JMP64m_REX;
break;
case X86::TAILJMPd:
case X86::TAILJMPd64:
Opcode = X86::JMP_1;
break;
case X86::TAILJMPd_CC:
case X86::TAILJMPd64_CC:
Opcode = X86::JCC_1;
break;
}
return Opcode;
}
void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
for (const MachineOperand &MO : MI->operands())
if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
OutMI.addOperand(MaybeMCOp.getValue());
// Handle a few special cases to eliminate operand modifiers.
switch (OutMI.getOpcode()) {
case X86::LEA64_32r:
case X86::LEA64r:
case X86::LEA16r:
case X86::LEA32r:
// LEA should have a segment register, but it must be empty.
assert(OutMI.getNumOperands() == 1 + X86::AddrNumOperands &&
"Unexpected # of LEA operands");
assert(OutMI.getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
"LEA has segment specified!");
break;
// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
// if one of the registers is extended but the other isn't.
case X86::VMOVZPQILo2PQIrr:
case X86::VMOVAPDrr:
case X86::VMOVAPDYrr:
case X86::VMOVAPSrr:
case X86::VMOVAPSYrr:
case X86::VMOVDQArr:
case X86::VMOVDQAYrr:
case X86::VMOVDQUrr:
case X86::VMOVDQUYrr:
case X86::VMOVUPDrr:
case X86::VMOVUPDYrr:
case X86::VMOVUPSrr:
case X86::VMOVUPSYrr: {
if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
}
OutMI.setOpcode(NewOpc);
}
break;
}
case X86::VMOVSDrr:
case X86::VMOVSSrr: {
if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
}
OutMI.setOpcode(NewOpc);
}
break;
}
case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rmik:
case X86::VPCMPBZ128rri: case X86::VPCMPBZ128rrik:
case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rmik:
case X86::VPCMPBZ256rri: case X86::VPCMPBZ256rrik:
case X86::VPCMPBZrmi: case X86::VPCMPBZrmik:
case X86::VPCMPBZrri: case X86::VPCMPBZrrik:
case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rmik:
case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
case X86::VPCMPDZ128rri: case X86::VPCMPDZ128rrik:
case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rmik:
case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
case X86::VPCMPDZ256rri: case X86::VPCMPDZ256rrik:
case X86::VPCMPDZrmi: case X86::VPCMPDZrmik:
case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
case X86::VPCMPDZrri: case X86::VPCMPDZrrik:
case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rmik:
case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
case X86::VPCMPQZ128rri: case X86::VPCMPQZ128rrik:
case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rmik:
case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
case X86::VPCMPQZ256rri: case X86::VPCMPQZ256rrik:
case X86::VPCMPQZrmi: case X86::VPCMPQZrmik:
case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
case X86::VPCMPQZrri: case X86::VPCMPQZrrik:
case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rmik:
case X86::VPCMPWZ128rri: case X86::VPCMPWZ128rrik:
case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rmik:
case X86::VPCMPWZ256rri: case X86::VPCMPWZ256rrik:
case X86::VPCMPWZrmi: case X86::VPCMPWZrmik:
case X86::VPCMPWZrri: case X86::VPCMPWZrrik: {
// Turn immediate 0 into the VPCMPEQ instruction.
if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break;
case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break;
case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break;
case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPEQBZ128rrk; break;
case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPEQBZ256rm; break;
case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPEQBZ256rmk; break;
case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPEQBZ256rr; break;
case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPEQBZ256rrk; break;
case X86::VPCMPBZrmi: NewOpc = X86::VPCMPEQBZrm; break;
case X86::VPCMPBZrmik: NewOpc = X86::VPCMPEQBZrmk; break;
case X86::VPCMPBZrri: NewOpc = X86::VPCMPEQBZrr; break;
case X86::VPCMPBZrrik: NewOpc = X86::VPCMPEQBZrrk; break;
case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPEQDZ128rm; break;
case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPEQDZ128rmb; break;
case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPEQDZ128rmbk; break;
case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPEQDZ128rmk; break;
case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPEQDZ128rr; break;
case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPEQDZ128rrk; break;
case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPEQDZ256rm; break;
case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPEQDZ256rmb; break;
case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPEQDZ256rmbk; break;
case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPEQDZ256rmk; break;
case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPEQDZ256rr; break;
case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPEQDZ256rrk; break;
case X86::VPCMPDZrmi: NewOpc = X86::VPCMPEQDZrm; break;
case X86::VPCMPDZrmib: NewOpc = X86::VPCMPEQDZrmb; break;
case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPEQDZrmbk; break;
case X86::VPCMPDZrmik: NewOpc = X86::VPCMPEQDZrmk; break;
case X86::VPCMPDZrri: NewOpc = X86::VPCMPEQDZrr; break;
case X86::VPCMPDZrrik: NewOpc = X86::VPCMPEQDZrrk; break;
case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPEQQZ128rm; break;
case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPEQQZ128rmb; break;
case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPEQQZ128rmbk; break;
case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPEQQZ128rmk; break;
case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPEQQZ128rr; break;
case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPEQQZ128rrk; break;
case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPEQQZ256rm; break;
case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPEQQZ256rmb; break;
case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPEQQZ256rmbk; break;
case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPEQQZ256rmk; break;
case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPEQQZ256rr; break;
case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPEQQZ256rrk; break;
case X86::VPCMPQZrmi: NewOpc = X86::VPCMPEQQZrm; break;
case X86::VPCMPQZrmib: NewOpc = X86::VPCMPEQQZrmb; break;
case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPEQQZrmbk; break;
case X86::VPCMPQZrmik: NewOpc = X86::VPCMPEQQZrmk; break;
case X86::VPCMPQZrri: NewOpc = X86::VPCMPEQQZrr; break;
case X86::VPCMPQZrrik: NewOpc = X86::VPCMPEQQZrrk; break;
case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPEQWZ128rm; break;
case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPEQWZ128rmk; break;
case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPEQWZ128rr; break;
case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPEQWZ128rrk; break;
case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPEQWZ256rm; break;
case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPEQWZ256rmk; break;
case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPEQWZ256rr; break;
case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPEQWZ256rrk; break;
case X86::VPCMPWZrmi: NewOpc = X86::VPCMPEQWZrm; break;
case X86::VPCMPWZrmik: NewOpc = X86::VPCMPEQWZrmk; break;
case X86::VPCMPWZrri: NewOpc = X86::VPCMPEQWZrr; break;
case X86::VPCMPWZrrik: NewOpc = X86::VPCMPEQWZrrk; break;
}
OutMI.setOpcode(NewOpc);
OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1));
break;
}
// Turn immediate 6 into the VPCMPGT instruction.
if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break;
case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break;
case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break;
case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPGTBZ128rrk; break;
case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPGTBZ256rm; break;
case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPGTBZ256rmk; break;
case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPGTBZ256rr; break;
case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPGTBZ256rrk; break;
case X86::VPCMPBZrmi: NewOpc = X86::VPCMPGTBZrm; break;
case X86::VPCMPBZrmik: NewOpc = X86::VPCMPGTBZrmk; break;
case X86::VPCMPBZrri: NewOpc = X86::VPCMPGTBZrr; break;
case X86::VPCMPBZrrik: NewOpc = X86::VPCMPGTBZrrk; break;
case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPGTDZ128rm; break;
case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPGTDZ128rmb; break;
case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPGTDZ128rmbk; break;
case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPGTDZ128rmk; break;
case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPGTDZ128rr; break;
case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPGTDZ128rrk; break;
case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPGTDZ256rm; break;
case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPGTDZ256rmb; break;
case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPGTDZ256rmbk; break;
case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPGTDZ256rmk; break;
case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPGTDZ256rr; break;
case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPGTDZ256rrk; break;
case X86::VPCMPDZrmi: NewOpc = X86::VPCMPGTDZrm; break;
case X86::VPCMPDZrmib: NewOpc = X86::VPCMPGTDZrmb; break;
case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPGTDZrmbk; break;
case X86::VPCMPDZrmik: NewOpc = X86::VPCMPGTDZrmk; break;
case X86::VPCMPDZrri: NewOpc = X86::VPCMPGTDZrr; break;
case X86::VPCMPDZrrik: NewOpc = X86::VPCMPGTDZrrk; break;
case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPGTQZ128rm; break;
case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPGTQZ128rmb; break;
case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPGTQZ128rmbk; break;
case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPGTQZ128rmk; break;
case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPGTQZ128rr; break;
case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPGTQZ128rrk; break;
case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPGTQZ256rm; break;
case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPGTQZ256rmb; break;
case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPGTQZ256rmbk; break;
case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPGTQZ256rmk; break;
case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPGTQZ256rr; break;
case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPGTQZ256rrk; break;
case X86::VPCMPQZrmi: NewOpc = X86::VPCMPGTQZrm; break;
case X86::VPCMPQZrmib: NewOpc = X86::VPCMPGTQZrmb; break;
case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPGTQZrmbk; break;
case X86::VPCMPQZrmik: NewOpc = X86::VPCMPGTQZrmk; break;
case X86::VPCMPQZrri: NewOpc = X86::VPCMPGTQZrr; break;
case X86::VPCMPQZrrik: NewOpc = X86::VPCMPGTQZrrk; break;
case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPGTWZ128rm; break;
case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPGTWZ128rmk; break;
case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPGTWZ128rr; break;
case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPGTWZ128rrk; break;
case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPGTWZ256rm; break;
case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPGTWZ256rmk; break;
case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPGTWZ256rr; break;
case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPGTWZ256rrk; break;
case X86::VPCMPWZrmi: NewOpc = X86::VPCMPGTWZrm; break;
case X86::VPCMPWZrmik: NewOpc = X86::VPCMPGTWZrmk; break;
case X86::VPCMPWZrri: NewOpc = X86::VPCMPGTWZrr; break;
case X86::VPCMPWZrrik: NewOpc = X86::VPCMPGTWZrrk; break;
}
OutMI.setOpcode(NewOpc);
OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1));
break;
}
break;
}
// CALL64r, CALL64pcrel32 - These instructions used to have
// register inputs modeled as normal uses instead of implicit uses. As such,
// we used to truncate off all but the first operand (the callee). This
// issue seems to have been fixed at some point. This assert verifies that.
case X86::CALL64r:
case X86::CALL64pcrel32:
assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!");
break;
case X86::EH_RETURN:
case X86::EH_RETURN64: {
OutMI = MCInst();
OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
break;
}
case X86::CLEANUPRET: {
// Replace CLEANUPRET with the appropriate RET.
OutMI = MCInst();
OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
break;
}
case X86::CATCHRET: {
// Replace CATCHRET with the appropriate RET.
const X86Subtarget &Subtarget = AsmPrinter.getSubtarget();
unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
OutMI = MCInst();
OutMI.setOpcode(getRetOpcode(Subtarget));
OutMI.addOperand(MCOperand::createReg(ReturnReg));
break;
}
// TAILJMPd, TAILJMPd64, TAILJMPd_CC - Lower to the correct jump
// instruction.
case X86::TAILJMPr:
case X86::TAILJMPr64:
case X86::TAILJMPr64_REX:
case X86::TAILJMPd:
case X86::TAILJMPd64:
assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!");
OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
break;
case X86::TAILJMPd_CC:
case X86::TAILJMPd64_CC:
assert(OutMI.getNumOperands() == 2 && "Unexpected number of operands!");
OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
break;
case X86::TAILJMPm:
case X86::TAILJMPm64:
case X86::TAILJMPm64_REX:
assert(OutMI.getNumOperands() == X86::AddrNumOperands &&
"Unexpected number of operands!");
OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
break;
case X86::DEC16r:
case X86::DEC32r:
case X86::INC16r:
case X86::INC32r:
// If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions.
if (!AsmPrinter.getSubtarget().is64Bit()) {
unsigned Opcode;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::DEC16r: Opcode = X86::DEC16r_alt; break;
case X86::DEC32r: Opcode = X86::DEC32r_alt; break;
case X86::INC16r: Opcode = X86::INC16r_alt; break;
case X86::INC32r: Opcode = X86::INC32r_alt; break;
}
OutMI.setOpcode(Opcode);
}
break;
// We don't currently select the correct instruction form for instructions
// which have a short %eax, etc. form. Handle this by custom lowering, for
// now.
//
// Note, we are currently not handling the following instructions:
// MOV64ao8, MOV64o8a
// XCHG16ar, XCHG32ar, XCHG64ar
case X86::MOV8mr_NOREX:
case X86::MOV8mr:
case X86::MOV8rm_NOREX:
case X86::MOV8rm:
case X86::MOV16mr:
case X86::MOV16rm:
case X86::MOV32mr:
case X86::MOV32rm: {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::MOV8mr_NOREX:
case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
case X86::MOV8rm_NOREX:
case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
}
SimplifyShortMoveForm(AsmPrinter, OutMI, NewOpc);
break;
}
case X86::ADC8ri: case X86::ADC16ri: case X86::ADC32ri: case X86::ADC64ri32:
case X86::ADD8ri: case X86::ADD16ri: case X86::ADD32ri: case X86::ADD64ri32:
case X86::AND8ri: case X86::AND16ri: case X86::AND32ri: case X86::AND64ri32:
case X86::CMP8ri: case X86::CMP16ri: case X86::CMP32ri: case X86::CMP64ri32:
case X86::OR8ri: case X86::OR16ri: case X86::OR32ri: case X86::OR64ri32:
case X86::SBB8ri: case X86::SBB16ri: case X86::SBB32ri: case X86::SBB64ri32:
case X86::SUB8ri: case X86::SUB16ri: case X86::SUB32ri: case X86::SUB64ri32:
case X86::TEST8ri:case X86::TEST16ri:case X86::TEST32ri:case X86::TEST64ri32:
case X86::XOR8ri: case X86::XOR16ri: case X86::XOR32ri: case X86::XOR64ri32: {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::ADC8ri: NewOpc = X86::ADC8i8; break;
case X86::ADC16ri: NewOpc = X86::ADC16i16; break;
case X86::ADC32ri: NewOpc = X86::ADC32i32; break;
case X86::ADC64ri32: NewOpc = X86::ADC64i32; break;
case X86::ADD8ri: NewOpc = X86::ADD8i8; break;
case X86::ADD16ri: NewOpc = X86::ADD16i16; break;
case X86::ADD32ri: NewOpc = X86::ADD32i32; break;
case X86::ADD64ri32: NewOpc = X86::ADD64i32; break;
case X86::AND8ri: NewOpc = X86::AND8i8; break;
case X86::AND16ri: NewOpc = X86::AND16i16; break;
case X86::AND32ri: NewOpc = X86::AND32i32; break;
case X86::AND64ri32: NewOpc = X86::AND64i32; break;
case X86::CMP8ri: NewOpc = X86::CMP8i8; break;
case X86::CMP16ri: NewOpc = X86::CMP16i16; break;
case X86::CMP32ri: NewOpc = X86::CMP32i32; break;
case X86::CMP64ri32: NewOpc = X86::CMP64i32; break;
case X86::OR8ri: NewOpc = X86::OR8i8; break;
case X86::OR16ri: NewOpc = X86::OR16i16; break;
case X86::OR32ri: NewOpc = X86::OR32i32; break;
case X86::OR64ri32: NewOpc = X86::OR64i32; break;
case X86::SBB8ri: NewOpc = X86::SBB8i8; break;
case X86::SBB16ri: NewOpc = X86::SBB16i16; break;
case X86::SBB32ri: NewOpc = X86::SBB32i32; break;
case X86::SBB64ri32: NewOpc = X86::SBB64i32; break;
case X86::SUB8ri: NewOpc = X86::SUB8i8; break;
case X86::SUB16ri: NewOpc = X86::SUB16i16; break;
case X86::SUB32ri: NewOpc = X86::SUB32i32; break;
case X86::SUB64ri32: NewOpc = X86::SUB64i32; break;
case X86::TEST8ri: NewOpc = X86::TEST8i8; break;
case X86::TEST16ri: NewOpc = X86::TEST16i16; break;
case X86::TEST32ri: NewOpc = X86::TEST32i32; break;
case X86::TEST64ri32: NewOpc = X86::TEST64i32; break;
case X86::XOR8ri: NewOpc = X86::XOR8i8; break;
case X86::XOR16ri: NewOpc = X86::XOR16i16; break;
case X86::XOR32ri: NewOpc = X86::XOR32i32; break;
case X86::XOR64ri32: NewOpc = X86::XOR64i32; break;
}
SimplifyShortImmForm(OutMI, NewOpc);
break;
}
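// Illustrative saving from SimplifyShortImmForm: when the destination is the
// accumulator, the opcode names the register implicitly, e.g.
//   addl $0x12345678, %eax   # 05 78 56 34 12      (short form, 5 bytes)
// instead of
//   addl $0x12345678, %eax   # 81 c0 78 56 34 12   (ModRM form, 6 bytes)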
// Try to shrink some forms of movsx.
case X86::MOVSX16rr8:
case X86::MOVSX32rr16:
case X86::MOVSX64rr32:
SimplifyMOVSX(OutMI);
break;
case X86::VCMPPDrri:
case X86::VCMPPDYrri:
case X86::VCMPPSrri:
case X86::VCMPPSYrri:
case X86::VCMPSDrr:
case X86::VCMPSSrr: {
// Swap the operands if it will enable a 2-byte VEX encoding.
// FIXME: Change the immediate to improve opportunities?
if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
unsigned Imm = MI->getOperand(3).getImm() & 0x7;
switch (Imm) {
default: break;
case 0x00: // EQUAL
case 0x03: // UNORDERED
case 0x04: // NOT EQUAL
case 0x07: // ORDERED
std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
break;
}
}
break;
}
case X86::VMOVHLPSrr:
case X86::VUNPCKHPDrr:
// These are not truly commutable so hide them from the default case.
break;
default: {
// If the instruction is a commutable arithmetic instruction we might be
// able to commute the operands to get a 2-byte VEX prefix.
uint64_t TSFlags = MI->getDesc().TSFlags;
if (MI->getDesc().isCommutable() &&
(TSFlags & X86II::EncodingMask) == X86II::VEX &&
(TSFlags & X86II::OpMapMask) == X86II::TB &&
(TSFlags & X86II::FormMask) == X86II::MRMSrcReg &&
!(TSFlags & X86II::VEX_W) && (TSFlags & X86II::VEX_4V) &&
OutMI.getNumOperands() == 3) {
if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg()))
std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
}
break;
}
}
}
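// Illustrative sketch of why the swaps above help: the 2-byte VEX prefix
// (C5) has no B bit, so it cannot reference an extended register in
// ModRM.r/m, while VEX.vvvv can name all 16 registers. For a commutable op:
//   vaddps %xmm8, %xmm0, %xmm1   # xmm8 in r/m  -> needs VEX.B, 3-byte C4
//   vaddps %xmm0, %xmm8, %xmm1   # xmm8 in vvvv -> 2-byte C5 suffices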
void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
const MachineInstr &MI) {
bool Is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
MI.getOpcode() == X86::TLS_base_addr64;
MCContext &Ctx = OutStreamer->getContext();
MCSymbolRefExpr::VariantKind SRVK;
switch (MI.getOpcode()) {
case X86::TLS_addr32:
case X86::TLS_addr64:
SRVK = MCSymbolRefExpr::VK_TLSGD;
break;
case X86::TLS_base_addr32:
SRVK = MCSymbolRefExpr::VK_TLSLDM;
break;
case X86::TLS_base_addr64:
SRVK = MCSymbolRefExpr::VK_TLSLD;
break;
default:
llvm_unreachable("unexpected opcode");
}
const MCSymbolRefExpr *Sym = MCSymbolRefExpr::create(
MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)), SRVK, Ctx);
// As of binutils 2.32, ld reports a bogus TLS relaxation error when it
// attempts to relax a GD/LD code sequence that uses R_X86_64_GOTPCREL
// (instead of R_X86_64_GOTPCRELX) to IE/LE (binutils PR24784). Work around
// the bug by only using the GOT when GOTPCRELX is enabled.
// TODO Delete the workaround when GOTPCRELX becomes commonplace.
bool UseGot = MMI->getModule()->getRtLibUseGOT() &&
Ctx.getAsmInfo()->canRelaxRelocations();
if (Is64Bits) {
bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD;
if (NeedsPadding)
EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
EmitAndCountInstruction(MCInstBuilder(X86::LEA64r)
.addReg(X86::RDI)
.addReg(X86::RIP)
.addImm(1)
.addReg(0)
.addExpr(Sym)
.addReg(0));
const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("__tls_get_addr");
if (NeedsPadding) {
if (!UseGot)
EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
}
if (UseGot) {
const MCExpr *Expr = MCSymbolRefExpr::create(
TlsGetAddr, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
EmitAndCountInstruction(MCInstBuilder(X86::CALL64m)
.addReg(X86::RIP)
.addImm(1)
.addReg(0)
.addExpr(Expr)
.addReg(0));
} else {
EmitAndCountInstruction(
MCInstBuilder(X86::CALL64pcrel32)
.addExpr(MCSymbolRefExpr::create(TlsGetAddr,
MCSymbolRefExpr::VK_PLT, Ctx)));
}
} else {
if (SRVK == MCSymbolRefExpr::VK_TLSGD && !UseGot) {
EmitAndCountInstruction(MCInstBuilder(X86::LEA32r)
.addReg(X86::EAX)
.addReg(0)
.addImm(1)
.addReg(X86::EBX)
.addExpr(Sym)
.addReg(0));
} else {
EmitAndCountInstruction(MCInstBuilder(X86::LEA32r)
.addReg(X86::EAX)
.addReg(X86::EBX)
.addImm(1)
.addReg(0)
.addExpr(Sym)
.addReg(0));
}
const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("___tls_get_addr");
if (UseGot) {
const MCExpr *Expr =
MCSymbolRefExpr::create(TlsGetAddr, MCSymbolRefExpr::VK_GOT, Ctx);
EmitAndCountInstruction(MCInstBuilder(X86::CALL32m)
.addReg(X86::EBX)
.addImm(1)
.addReg(0)
.addExpr(Expr)
.addReg(0));
} else {
EmitAndCountInstruction(
MCInstBuilder(X86::CALLpcrel32)
.addExpr(MCSymbolRefExpr::create(TlsGetAddr,
MCSymbolRefExpr::VK_PLT, Ctx)));
}
}
}
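// For reference, in the 64-bit !UseGot case the code above produces the
// classic fixed-size GD sequence (illustrative, for a TLS variable `x`):
//   .byte 0x66
//   leaq  x@tlsgd(%rip), %rdi
//   .word 0x6666
//   rex64
//   call  __tls_get_addr@PLT
// The data16/rex64 padding keeps the sequence at the size the linker expects
// when relaxing GD to IE/LE.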
/// Return the longest nop which can be efficiently decoded for the given
/// target cpu. 15 bytes is the longest single NOP instruction, but some
/// platforms can't decode the longest forms efficiently.
static unsigned MaxLongNopLength(const MCSubtargetInfo &STI) {
uint64_t MaxNopLength = 10;
if (STI.getFeatureBits()[X86::ProcIntelSLM])
MaxNopLength = 7;
else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
MaxNopLength = 15;
else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
MaxNopLength = 11;
return MaxNopLength;
}
/// Emit the largest nop instruction smaller than or equal to \p NumBytes
/// bytes. Return the size of the nop emitted.
static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI) {
if (!Is64Bit) {
// TODO Do additional checking if the CPU supports multi-byte nops.
OS.EmitInstruction(MCInstBuilder(X86::NOOP), STI);
return 1;
}
// Cap a single nop emission at the profitable value for the target
NumBytes = std::min(NumBytes, MaxLongNopLength(STI));
unsigned NopSize;
unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
IndexReg = Displacement = SegmentReg = 0;
BaseReg = X86::RAX;
ScaleVal = 1;
switch (NumBytes) {
case 0:
llvm_unreachable("Zero nops?");
break;
case 1:
NopSize = 1;
Opc = X86::NOOP;
break;
case 2:
NopSize = 2;
Opc = X86::XCHG16ar;
break;
case 3:
NopSize = 3;
Opc = X86::NOOPL;
break;
case 4:
NopSize = 4;
Opc = X86::NOOPL;
Displacement = 8;
break;
case 5:
NopSize = 5;
Opc = X86::NOOPL;
Displacement = 8;
IndexReg = X86::RAX;
break;
case 6:
NopSize = 6;
Opc = X86::NOOPW;
Displacement = 8;
IndexReg = X86::RAX;
break;
case 7:
NopSize = 7;
Opc = X86::NOOPL;
Displacement = 512;
break;
case 8:
NopSize = 8;
Opc = X86::NOOPL;
Displacement = 512;
IndexReg = X86::RAX;
break;
case 9:
NopSize = 9;
Opc = X86::NOOPW;
Displacement = 512;
IndexReg = X86::RAX;
break;
default:
NopSize = 10;
Opc = X86::NOOPW;
Displacement = 512;
IndexReg = X86::RAX;
SegmentReg = X86::CS;
break;
}
unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
NopSize += NumPrefixes;
for (unsigned i = 0; i != NumPrefixes; ++i)
OS.EmitBytes("\x66");
switch (Opc) {
default: llvm_unreachable("Unexpected opcode");
case X86::NOOP:
OS.EmitInstruction(MCInstBuilder(Opc), STI);
break;
case X86::XCHG16ar:
OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX), STI);
break;
case X86::NOOPL:
case X86::NOOPW:
OS.EmitInstruction(MCInstBuilder(Opc)
.addReg(BaseReg)
.addImm(ScaleVal)
.addReg(IndexReg)
.addImm(Displacement)
.addReg(SegmentReg),
STI);
break;
}
assert(NopSize <= NumBytes && "We overemitted?");
return NopSize;
}
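// Roughly, the forms chosen above assemble as follows (illustrative):
//   1: nop                        90
//   2: xchg %ax, %ax              66 90
//   3: nopl (%rax)                0f 1f 00
//   4: nopl 8(%rax)               0f 1f 40 08
//   5: nopl 8(%rax,%rax)          0f 1f 44 00 08
//   6: nopw 8(%rax,%rax)          66 0f 1f 44 00 08
//   7: nopl 512(%rax)             0f 1f 80 00 02 00 00
//   8: nopl 512(%rax,%rax)        0f 1f 84 00 00 02 00 00
//   9: nopw 512(%rax,%rax)        66 0f 1f 84 00 00 02 00 00
//  10: nopw %cs:512(%rax,%rax)    (row 9 plus a CS override byte)
// Up to five extra 0x66 prefixes can stretch the final form further.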
/// Emit the optimal amount of multi-byte nops on X86.
static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI) {
unsigned NopsToEmit = NumBytes;
(void)NopsToEmit;
while (NumBytes) {
NumBytes -= EmitNop(OS, NumBytes, Is64Bit, STI);
assert(NopsToEmit >= NumBytes && "Emitted more than I asked for!");
}
}
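// Usage sketch: with the default 10-byte cap from MaxLongNopLength, a call
// like
//   EmitNops(*OutStreamer, 25, /*Is64Bit=*/true, getSubtargetInfo());
// decomposes into 10-, 10-, and 5-byte nops.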
/// A RAII helper which defines a region of instructions which can't have
/// padding added between them for correctness.
struct NoAutoPaddingScope {
MCStreamer &OS;
const bool OldAllowAutoPadding;
NoAutoPaddingScope(MCStreamer &OS)
: OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) {
changeAndComment(false);
}
~NoAutoPaddingScope() {
changeAndComment(OldAllowAutoPadding);
}
void changeAndComment(bool b) {
if (b == OS.getAllowAutoPadding())
return;
OS.setAllowAutoPadding(b);
if (b)
OS.emitRawComment("autopadding");
else
OS.emitRawComment("noautopadding");
}
};
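// Usage sketch: the scope brackets sequences whose byte-for-byte layout must
// not change, e.g.
//   {
//     NoAutoPaddingScope NoPadScope(*OutStreamer);
//     // ... emit instructions that must stay contiguous ...
//   } // prior auto-padding setting restored here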
void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
NoAutoPaddingScope NoPadScope(*OutStreamer);
StatepointOpers SOpers(&MI);
if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
getSubtargetInfo());
} else {
// Lower call target and choose correct opcode
const MachineOperand &CallTarget = SOpers.getCallTarget();
MCOperand CallTargetMCOp;
unsigned CallOpcode;
switch (CallTarget.getType()) {
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ExternalSymbol:
CallTargetMCOp = MCIL.LowerSymbolOperand(
CallTarget, MCIL.GetSymbolFromOperand(CallTarget));
CallOpcode = X86::CALL64pcrel32;
// Currently, we only support relative addressing with statepoints.
// Otherwise, we'll need a scratch register to hold the target
// address. You'll fail asserts during load & relocation if this
// symbol is too far away. (TODO: support non-relative addressing)
break;
case MachineOperand::MO_Immediate:
CallTargetMCOp = MCOperand::createImm(CallTarget.getImm());
CallOpcode = X86::CALL64pcrel32;
// Currently, we only support relative addressing with statepoints.
// Otherwise, we'll need a scratch register to hold the target
// immediate. You'll fail asserts during load & relocation if this
// address is too far away. (TODO: support non-relative addressing)
break;
case MachineOperand::MO_Register:
// FIXME: Add retpoline support and remove this.
- if (Subtarget->useRetpolineIndirectCalls())
- report_fatal_error("Lowering register statepoints with retpoline not "
+ if (Subtarget->useIndirectThunkCalls())
+ report_fatal_error("Lowering register statepoints with thunks not "
"yet implemented.");
CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
CallOpcode = X86::CALL64r;
break;
default:
llvm_unreachable("Unsupported operand type in statepoint call target");
break;
}
// Emit call
MCInst CallInst;
CallInst.setOpcode(CallOpcode);
CallInst.addOperand(CallTargetMCOp);
OutStreamer->EmitInstruction(CallInst, getSubtargetInfo());
}
// Record our statepoint node in the same section used by STACKMAP
// and PATCHPOINT
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
OutStreamer->EmitLabel(MILabel);
SM.recordStatepoint(*MILabel, MI);
}
void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
X86MCInstLower &MCIL) {
// FAULTING_LOAD_OP <def>, <faulting type>, <MBB handler>,
// <opcode>, <operands>
NoAutoPaddingScope NoPadScope(*OutStreamer);
Register DefRegister = FaultingMI.getOperand(0).getReg();
FaultMaps::FaultKind FK =
static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
unsigned Opcode = FaultingMI.getOperand(3).getImm();
unsigned OperandsBeginIdx = 4;
auto &Ctx = OutStreamer->getContext();
MCSymbol *FaultingLabel = Ctx.createTempSymbol();
OutStreamer->EmitLabel(FaultingLabel);
assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel);
MCInst MI;
MI.setOpcode(Opcode);
if (DefRegister != X86::NoRegister)
MI.addOperand(MCOperand::createReg(DefRegister));
for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx,
E = FaultingMI.operands_end();
I != E; ++I)
if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I))
MI.addOperand(MaybeOperand.getValue());
OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
OutStreamer->EmitInstruction(MI, getSubtargetInfo());
}
void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
X86MCInstLower &MCIL) {
bool Is64Bits = Subtarget->is64Bit();
MCContext &Ctx = OutStreamer->getContext();
MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__");
const MCSymbolRefExpr *Op =
MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_None, Ctx);
EmitAndCountInstruction(
MCInstBuilder(Is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32)
.addExpr(Op));
}
void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// PATCHABLE_OP minsize, opcode, operands
NoAutoPaddingScope NoPadScope(*OutStreamer);
unsigned MinSize = MI.getOperand(0).getImm();
unsigned Opcode = MI.getOperand(1).getImm();
MCInst MCI;
MCI.setOpcode(Opcode);
for (auto &MO : make_range(MI.operands_begin() + 2, MI.operands_end()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
MCI.addOperand(MaybeOperand.getValue());
SmallString<256> Code;
SmallVector<MCFixup, 4> Fixups;
raw_svector_ostream VecOS(Code);
CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());
if (Code.size() < MinSize) {
if (MinSize == 2 && Opcode == X86::PUSH64r) {
// This is an optimization that lets us get away without emitting a nop in
// many cases.
//
// NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %r9) takes two
// bytes too, so the check on MinSize is important.
MCI.setOpcode(X86::PUSH64rmr);
} else {
unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
getSubtargetInfo());
assert(NopSize == MinSize && "Could not implement MinSize!");
(void)NopSize;
}
}
OutStreamer->EmitInstruction(MCI, getSubtargetInfo());
}
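// For illustration, the PUSH64r -> PUSH64rmr rewrite above trades the 1-byte
// "50+rd" encoding for the 2-byte "FF /6" one:
//   push %rax   # 50      (1 byte: too short when MinSize == 2)
//   push %rax   # ff f0   (2 bytes: PUSH64rmr form)
//   push %r9    # 41 51   (already 2 bytes, hence the MinSize check)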
// Lower a stackmap of the form:
// <id>, <shadowBytes>, ...
void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
OutStreamer->EmitLabel(MILabel);
SM.recordStackMap(*MILabel, MI);
unsigned NumShadowBytes = MI.getOperand(1).getImm();
SMShadowTracker.reset(NumShadowBytes);
}
// Lower a patchpoint of the form:
// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64");
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
NoAutoPaddingScope NoPadScope(*OutStreamer);
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
OutStreamer->EmitLabel(MILabel);
SM.recordPatchPoint(*MILabel, MI);
PatchPointOpers opers(&MI);
unsigned ScratchIdx = opers.getNextScratchIdx();
unsigned EncodedBytes = 0;
const MachineOperand &CalleeMO = opers.getCallTarget();
// Check for null target. If target is non-null (i.e. is non-zero or is
// symbolic) then emit a call.
if (!(CalleeMO.isImm() && !CalleeMO.getImm())) {
MCOperand CalleeMCOp;
switch (CalleeMO.getType()) {
default:
/// FIXME: Add a verifier check for bad callee types.
llvm_unreachable("Unrecognized callee operand type.");
case MachineOperand::MO_Immediate:
if (CalleeMO.getImm())
CalleeMCOp = MCOperand::createImm(CalleeMO.getImm());
break;
case MachineOperand::MO_ExternalSymbol:
case MachineOperand::MO_GlobalAddress:
CalleeMCOp = MCIL.LowerSymbolOperand(CalleeMO,
MCIL.GetSymbolFromOperand(CalleeMO));
break;
}
// Emit MOV to materialize the target address and the CALL to target.
// This is encoded with 12-13 bytes, depending on which register is used.
Register ScratchReg = MI.getOperand(ScratchIdx).getReg();
if (X86II::isX86_64ExtendedReg(ScratchReg))
EncodedBytes = 13;
else
EncodedBytes = 12;
EmitAndCountInstruction(
MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
// FIXME: Add retpoline support and remove this.
- if (Subtarget->useRetpolineIndirectCalls())
+ if (Subtarget->useIndirectThunkCalls())
report_fatal_error(
- "Lowering patchpoint with retpoline not yet implemented.");
+ "Lowering patchpoint with thunks not yet implemented.");
EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
}
// Emit padding.
unsigned NumBytes = opers.getNumPatchBytes();
assert(NumBytes >= EncodedBytes &&
"Patchpoint can't request size less than the length of a call.");
EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
getSubtargetInfo());
}
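// Illustrative encoding budget for the materialize-and-call sequence above:
//   movabsq $target, %rax   # 48 b8 <imm64>   (10 bytes)
//   callq   *%rax           # ff d0           (2 bytes)
// An extended scratch register (r8-r15) needs an extra REX byte on the call,
// which gives the 13-byte case.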
void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "XRay custom events only supports X86-64");
NoAutoPaddingScope NoPadScope(*OutStreamer);
// We want to emit the following pattern, which follows the x86 calling
// convention to prepare for the trampoline call to be patched in.
//
// .p2align 1, ...
// .Lxray_event_sled_N:
// jmp +N // jump across the instrumentation sled
// ... // set up arguments in registers
// callq __xray_CustomEvent@plt // force dependency to symbol
// ...
// <jump here>
//
// After patching, it would look something like:
//
// nopw (2-byte nop)
// ...
// callq __xray_CustomEvent // already lowered
// ...
//
// ---
// First we emit the label and the jump.
auto CurSled = OutContext.createTempSymbol("xray_event_sled_", true);
OutStreamer->AddComment("# XRay Custom Event Log");
OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way to force the relative jump.
OutStreamer->EmitBinaryData("\xeb\x0f");
// The default C calling convention will place two arguments into %rcx and
// %rdx -- so we only work with those.
const Register DestRegs[] = {X86::RDI, X86::RSI};
bool UsedMask[] = {false, false};
// Filled out in loop.
Register SrcRegs[] = {0, 0};
// Then we put the operands in the %rdi and %rsi registers. We spill the
// values in the registers before we clobber them, and mark them as used in
// UsedMask. In case the arguments are already in the correct register, we
// emit nops appropriately sized to keep the sled the same size in every
// situation.
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
assert(Op->isReg() && "Only support arguments in registers");
SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64);
if (SrcRegs[I] != DestRegs[I]) {
UsedMask[I] = true;
EmitAndCountInstruction(
MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
} else {
EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
}
}
// Now that the register values are stashed, mov arguments into place.
// FIXME: This doesn't work if one of the later SrcRegs is equal to an
// earlier DestReg. We will have already overwritten the register before
// we can copy from it.
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
if (SrcRegs[I] != DestRegs[I])
EmitAndCountInstruction(
MCInstBuilder(X86::MOV64rr).addReg(DestRegs[I]).addReg(SrcRegs[I]));
// We emit a hard dependency on the __xray_CustomEvent symbol, which is the
// name of the trampoline to be implemented by the XRay runtime.
auto TSym = OutContext.getOrCreateSymbol("__xray_CustomEvent");
MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym);
if (isPositionIndependent())
TOp.setTargetFlags(X86II::MO_PLT);
// Emit the call instruction.
EmitAndCountInstruction(MCInstBuilder(X86::CALL64pcrel32)
.addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));
// Restore caller-saved and used registers.
for (unsigned I = sizeof UsedMask; I-- > 0;)
if (UsedMask[I])
EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
else
EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
OutStreamer->AddComment("xray custom event end.");
// Record the sled version. Older versions of this sled were spelled
// differently, so we let the runtime handle the different offsets we're
// using.
recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 1);
}
void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "XRay typed events only supports X86-64");
NoAutoPaddingScope NoPadScope(*OutStreamer);
// We want to emit the following pattern, which follows the x86 calling
// convention to prepare for the trampoline call to be patched in.
//
// .p2align 1, ...
// .Lxray_event_sled_N:
// jmp +N // jump across the instrumentation sled
// ... // set up arguments in registers
// callq __xray_TypedEvent@plt // force dependency to symbol
// ...
// <jump here>
//
// After patching, it would look something like:
//
// nopw (2-byte nop)
// ...
// callq __xray_TypedEvent // already lowered
// ...
//
// ---
// First we emit the label and the jump.
auto CurSled = OutContext.createTempSymbol("xray_typed_event_sled_", true);
OutStreamer->AddComment("# XRay Typed Event Log");
OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way to force the relative jump.
OutStreamer->EmitBinaryData("\xeb\x14");
// An x86-64 convention may place three arguments into %rcx, %rdx, and %r8,
// so we'll work with those. Or we may be called via SystemV, in which case
// we don't have to do any translation.
const Register DestRegs[] = {X86::RDI, X86::RSI, X86::RDX};
bool UsedMask[] = {false, false, false};
// Will fill out src regs in the loop.
Register SrcRegs[] = {0, 0, 0};
// Then we put the operands in the SystemV registers. We spill the values in
// the registers before we clobber them, and mark them as used in UsedMask.
// In case the arguments are already in the correct register, we emit nops
// appropriately sized to keep the sled the same size in every situation.
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
// TODO: Is register-only support adequate?
assert(Op->isReg() && "Only supports arguments in registers");
SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64);
if (SrcRegs[I] != DestRegs[I]) {
UsedMask[I] = true;
EmitAndCountInstruction(
MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
} else {
EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
}
}
// In the above loop we only stash the destination registers, or emit nops if
// the arguments are already in the right place. The actual moving is
// postponed until after all the registers are stashed, so nothing is
// clobbered. We've already added nops to account for the size of mov and
// push if the register is in the right place, so we only have to worry about
// emitting movs.
// FIXME: This doesn't work if one of the later SrcRegs is equal to an
// earlier DestReg. We will have already overwritten the register before
// we can copy from it.
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
if (UsedMask[I])
EmitAndCountInstruction(
MCInstBuilder(X86::MOV64rr).addReg(DestRegs[I]).addReg(SrcRegs[I]));
// We emit a hard dependency on the __xray_TypedEvent symbol, which is the
// name of the trampoline to be implemented by the XRay runtime.
auto TSym = OutContext.getOrCreateSymbol("__xray_TypedEvent");
MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym);
if (isPositionIndependent())
TOp.setTargetFlags(X86II::MO_PLT);
// Emit the call instruction.
EmitAndCountInstruction(MCInstBuilder(X86::CALL64pcrel32)
.addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));
// Restore caller-saved and used registers.
for (unsigned I = sizeof UsedMask; I-- > 0;)
if (UsedMask[I])
EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
else
EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
OutStreamer->AddComment("xray typed event end.");
// Record the sled version.
recordSled(CurSled, MI, SledKind::TYPED_EVENT, 0);
}
void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
X86MCInstLower &MCIL) {
NoAutoPaddingScope NoPadScope(*OutStreamer);
const Function &F = MF->getFunction();
if (F.hasFnAttribute("patchable-function-entry")) {
unsigned Num;
if (F.getFnAttribute("patchable-function-entry")
.getValueAsString()
.getAsInteger(10, Num))
return;
EmitNops(*OutStreamer, Num, Subtarget->is64Bit(), getSubtargetInfo());
return;
}
// We want to emit the following pattern:
//
// .p2align 1, ...
// .Lxray_sled_N:
// jmp .tmpN
// # 9 bytes worth of noops
//
// We need the 9 bytes because at runtime, we'd be patching over the full 11
// bytes with the following pattern:
//
// mov %r10, <function id, 32-bit> // 6 bytes
// call <relative offset, 32-bits> // 5 bytes
//
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way to force the relative jump.
OutStreamer->EmitBytes("\xeb\x09");
EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
recordSled(CurSled, MI, SledKind::FUNCTION_ENTER);
}
void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
X86MCInstLower &MCIL) {
NoAutoPaddingScope NoPadScope(*OutStreamer);
// Since PATCHABLE_RET takes the opcode of the return statement as an
// argument, we use that to emit the correct form of the RET that we want.
// i.e. when we see this:
//
// PATCHABLE_RET X86::RET ...
//
// We should emit the RET followed by sleds.
//
// .p2align 1, ...
// .Lxray_sled_N:
// ret # or equivalent instruction
// # 10 bytes worth of noops
//
// This just makes sure that the alignment for the next instruction is 2.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
unsigned OpCode = MI.getOperand(0).getImm();
MCInst Ret;
Ret.setOpcode(OpCode);
for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
Ret.addOperand(MaybeOperand.getValue());
OutStreamer->EmitInstruction(Ret, getSubtargetInfo());
EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo());
recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
}
void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
X86MCInstLower &MCIL) {
NoAutoPaddingScope NoPadScope(*OutStreamer);
// Like PATCHABLE_RET, we have the actual instruction in the operands to this
// instruction so we lower that particular instruction and its operands.
// Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
// we do it for PATCHABLE_FUNCTION_ENTER. The sled should be very similar to
// the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
// tail call much like how we have it in PATCHABLE_RET.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way to force the relative jump.
OutStreamer->EmitBytes("\xeb\x09");
EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
OutStreamer->EmitLabel(Target);
recordSled(CurSled, MI, SledKind::TAIL_CALL);
unsigned OpCode = MI.getOperand(0).getImm();
OpCode = convertTailJumpOpcode(OpCode);
MCInst TC;
TC.setOpcode(OpCode);
// Before emitting the instruction, add a comment to indicate that this is
// indeed a tail call.
OutStreamer->AddComment("TAILCALL");
for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
TC.addOperand(MaybeOperand.getValue());
OutStreamer->EmitInstruction(TC, getSubtargetInfo());
}
// Returns instruction preceding MBBI in MachineFunction.
// If MBBI is the first instruction of the first basic block, returns null.
static MachineBasicBlock::const_iterator
PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
const MachineBasicBlock *MBB = MBBI->getParent();
while (MBBI == MBB->begin()) {
if (MBB == &MBB->getParent()->front())
return MachineBasicBlock::const_iterator();
MBB = MBB->getPrevNode();
MBBI = MBB->end();
}
--MBBI;
return MBBI;
}
static const Constant *getConstantFromPool(const MachineInstr &MI,
const MachineOperand &Op) {
if (!Op.isCPI() || Op.getOffset() != 0)
return nullptr;
ArrayRef<MachineConstantPoolEntry> Constants =
MI.getParent()->getParent()->getConstantPool()->getConstants();
const MachineConstantPoolEntry &ConstantEntry = Constants[Op.getIndex()];
// Bail if this is a machine constant pool entry; we won't be able to dig out
// anything useful.
if (ConstantEntry.isMachineConstantPoolEntry())
return nullptr;
const Constant *C = ConstantEntry.Val.ConstVal;
assert((!C || ConstantEntry.getType() == C->getType()) &&
"Expected a constant of the same type!");
return C;
}
static std::string getShuffleComment(const MachineInstr *MI, unsigned SrcOp1Idx,
unsigned SrcOp2Idx, ArrayRef<int> Mask) {
std::string Comment;
// Compute the name for a register. This is really goofy because we have
// multiple instruction printers that could (in theory) use different
// names. Fortunately most people use the ATT style (outside of Windows)
// and they actually agree on register naming here. Ultimately, this is
// a comment, and so it's OK if it isn't perfect.
auto GetRegisterName = [](unsigned RegNum) -> StringRef {
return X86ATTInstPrinter::getRegisterName(RegNum);
};
const MachineOperand &DstOp = MI->getOperand(0);
const MachineOperand &SrcOp1 = MI->getOperand(SrcOp1Idx);
const MachineOperand &SrcOp2 = MI->getOperand(SrcOp2Idx);
StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
StringRef Src1Name =
SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem";
StringRef Src2Name =
SrcOp2.isReg() ? GetRegisterName(SrcOp2.getReg()) : "mem";
// One source operand, fix the mask to print all elements in one span.
SmallVector<int, 8> ShuffleMask(Mask.begin(), Mask.end());
if (Src1Name == Src2Name)
for (int i = 0, e = ShuffleMask.size(); i != e; ++i)
if (ShuffleMask[i] >= e)
ShuffleMask[i] -= e;
raw_string_ostream CS(Comment);
CS << DstName;
// Handle AVX512 MASK/MASKZ write mask comments.
// MASK: zmmX {%kY}
// MASKZ: zmmX {%kY} {z}
if (SrcOp1Idx > 1) {
assert((SrcOp1Idx == 2 || SrcOp1Idx == 3) && "Unexpected writemask");
const MachineOperand &WriteMaskOp = MI->getOperand(SrcOp1Idx - 1);
if (WriteMaskOp.isReg()) {
CS << " {%" << GetRegisterName(WriteMaskOp.getReg()) << "}";
if (SrcOp1Idx == 2) {
CS << " {z}";
}
}
}
CS << " = ";
for (int i = 0, e = ShuffleMask.size(); i != e; ++i) {
if (i != 0)
CS << ",";
if (ShuffleMask[i] == SM_SentinelZero) {
CS << "zero";
continue;
}
// Otherwise, it must come from src1 or src2. Print the span of elements
// that comes from this src.
bool isSrc1 = ShuffleMask[i] < (int)e;
CS << (isSrc1 ? Src1Name : Src2Name) << '[';
bool IsFirst = true;
while (i != e && ShuffleMask[i] != SM_SentinelZero &&
(ShuffleMask[i] < (int)e) == isSrc1) {
if (!IsFirst)
CS << ',';
else
IsFirst = false;
if (ShuffleMask[i] == SM_SentinelUndef)
CS << "u";
else
CS << ShuffleMask[i] % (int)e;
++i;
}
CS << ']';
--i; // For loop increments element #.
}
CS.flush();
return Comment;
}
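// With a constant shuffle mask, the generated comment might look like
// (hypothetical registers and indices):
//   xmm0 = xmm1[0,1],zero,zero,xmm1[4,5,6,7]
// and, for a zeroing-masked AVX512 form:
//   zmm0 {%k1} {z} = zmm1[...]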
static void printConstant(const APInt &Val, raw_ostream &CS) {
if (Val.getBitWidth() <= 64) {
CS << Val.getZExtValue();
} else {
// print multi-word constant as (w0,w1)
CS << "(";
for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
if (i > 0)
CS << ",";
CS << Val.getRawData()[i];
}
CS << ")";
}
}
static void printConstant(const APFloat &Flt, raw_ostream &CS) {
SmallString<32> Str;
// Force scientific notation to distinguish from integers.
Flt.toString(Str, 0, 0);
CS << Str;
}
static void printConstant(const Constant *COp, raw_ostream &CS) {
if (isa<UndefValue>(COp)) {
CS << "u";
} else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
printConstant(CI->getValue(), CS);
} else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
printConstant(CF->getValueAPF(), CS);
} else {
CS << "?";
}
}
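// Sketch of the resulting strings (illustrative): an integer constant prints
// as its value ("7"), a float prints in forced scientific form (e.g.
// "1.0E+0"), an undef element prints as "u", and anything unrecognized
// prints as "?".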
void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
assert(getSubtarget().isOSWindows() && "SEH_ instruction Windows only");
// Use the .cv_fpo directives if we're emitting CodeView on 32-bit x86.
if (EmitFPOData) {
X86TargetStreamer *XTS =
static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer());
switch (MI->getOpcode()) {
case X86::SEH_PushReg:
XTS->emitFPOPushReg(MI->getOperand(0).getImm());
break;
case X86::SEH_StackAlloc:
XTS->emitFPOStackAlloc(MI->getOperand(0).getImm());
break;
case X86::SEH_StackAlign:
XTS->emitFPOStackAlign(MI->getOperand(0).getImm());
break;
case X86::SEH_SetFrame:
assert(MI->getOperand(1).getImm() == 0 &&
".cv_fpo_setframe takes no offset");
XTS->emitFPOSetFrame(MI->getOperand(0).getImm());
break;
case X86::SEH_EndPrologue:
XTS->emitFPOEndPrologue();
break;
case X86::SEH_SaveReg:
case X86::SEH_SaveXMM:
case X86::SEH_PushFrame:
llvm_unreachable("SEH_ directive incompatible with FPO");
break;
default:
llvm_unreachable("expected SEH_ instruction");
}
return;
}
// Otherwise, use the .seh_ directives for all other Windows platforms.
switch (MI->getOpcode()) {
case X86::SEH_PushReg:
OutStreamer->EmitWinCFIPushReg(MI->getOperand(0).getImm());
break;
case X86::SEH_SaveReg:
OutStreamer->EmitWinCFISaveReg(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
break;
case X86::SEH_SaveXMM:
OutStreamer->EmitWinCFISaveXMM(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
break;
case X86::SEH_StackAlloc:
OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
break;
case X86::SEH_SetFrame:
OutStreamer->EmitWinCFISetFrame(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
break;
case X86::SEH_PushFrame:
OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
break;
case X86::SEH_EndPrologue:
OutStreamer->EmitWinCFIEndProlog();
break;
default:
llvm_unreachable("expected SEH_ instruction");
}
}
static unsigned getRegisterWidth(const MCOperandInfo &Info) {
if (Info.RegClass == X86::VR128RegClassID ||
Info.RegClass == X86::VR128XRegClassID)
return 128;
if (Info.RegClass == X86::VR256RegClassID ||
Info.RegClass == X86::VR256XRegClassID)
return 256;
if (Info.RegClass == X86::VR512RegClassID)
return 512;
llvm_unreachable("Unknown register class!");
}
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(*MF, *this);
const X86RegisterInfo *RI =
MF->getSubtarget<X86Subtarget>().getRegisterInfo();
// Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
// are compressed from EVEX encoding to VEX encoding.
if (TM.Options.MCOptions.ShowMCEncoding) {
if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
OutStreamer->AddComment("EVEX TO VEX Compression ", false);
}
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
// Emit nothing here but a comment if we can.
case X86::Int_MemBarrier:
OutStreamer->emitRawComment("MEMBARRIER");
return;
case X86::EH_RETURN:
case X86::EH_RETURN64: {
// Lower these as normal, but add some comments.
Register Reg = MI->getOperand(0).getReg();
OutStreamer->AddComment(StringRef("eh_return, addr: %") +
X86ATTInstPrinter::getRegisterName(Reg));
break;
}
case X86::CLEANUPRET: {
// Lower these as normal, but add some comments.
OutStreamer->AddComment("CLEANUPRET");
break;
}
case X86::CATCHRET: {
// Lower these as normal, but add some comments.
OutStreamer->AddComment("CATCHRET");
break;
}
case X86::ENDBR32:
case X86::ENDBR64: {
// CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for
// -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be
// non-empty. If MI is the initial ENDBR, place the
// __patchable_function_entries label after ENDBR.
if (CurrentPatchableFunctionEntrySym &&
CurrentPatchableFunctionEntrySym == CurrentFnBegin &&
MI == &MF->front().front()) {
MCInst Inst;
MCInstLowering.Lower(MI, Inst);
EmitAndCountInstruction(Inst);
CurrentPatchableFunctionEntrySym = createTempSymbol("patch");
OutStreamer->EmitLabel(CurrentPatchableFunctionEntrySym);
return;
}
break;
}
case X86::TAILJMPr:
case X86::TAILJMPm:
case X86::TAILJMPd:
case X86::TAILJMPd_CC:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
case X86::TAILJMPd64:
case X86::TAILJMPd64_CC:
case X86::TAILJMPr64_REX:
case X86::TAILJMPm64_REX:
// Lower these as normal, but add some comments.
OutStreamer->AddComment("TAILCALL");
break;
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return LowerTlsAddr(MCInstLowering, *MI);
// Loading/storing mask pairs requires two kmov operations. The second one of
// these needs a 2-byte displacement relative to the specified address (with a
// 32-bit spill size). Pairs of 1-bit up to 16-bit masks all use the same
// spill size; they are all stored using MASKPAIR16STORE and loaded using
// MASKPAIR16LOAD.
//
// The displacement value might wrap around in theory, thus the asserts in both
// cases.
case X86::MASKPAIR16LOAD: {
int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm();
assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
Register Reg = MI->getOperand(0).getReg();
Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
// Load the first mask register
MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm);
MIB.addReg(Reg0);
for (int i = 0; i < X86::AddrNumOperands; ++i) {
auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i));
MIB.addOperand(Op.getValue());
}
EmitAndCountInstruction(MIB);
// Load the second mask register of the pair
MIB = MCInstBuilder(X86::KMOVWkm);
MIB.addReg(Reg1);
for (int i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp) {
MIB.addImm(Disp + 2);
} else {
auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i));
MIB.addOperand(Op.getValue());
}
}
EmitAndCountInstruction(MIB);
return;
}
case X86::MASKPAIR16STORE: {
int64_t Disp = MI->getOperand(X86::AddrDisp).getImm();
assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
Register Reg = MI->getOperand(X86::AddrNumOperands).getReg();
Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
// Store the first mask register
MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk);
for (int i = 0; i < X86::AddrNumOperands; ++i)
MIB.addOperand(MCInstLowering.LowerMachineOperand(MI, MI->getOperand(i)).getValue());
MIB.addReg(Reg0);
EmitAndCountInstruction(MIB);
// Store the second mask register of the pair
MIB = MCInstBuilder(X86::KMOVWmk);
for (int i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp) {
MIB.addImm(Disp + 2);
} else {
auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(0 + i));
MIB.addOperand(Op.getValue());
}
}
MIB.addReg(Reg1);
EmitAndCountInstruction(MIB);
return;
}
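// Illustrative expansion (with hypothetical %k0/%k1 as the pair's halves):
// a mask-pair spill at 16(%rsp) becomes two 16-bit kmov operations, the
// second at the +2 displacement checked above:
//   kmovw %k0, 16(%rsp)   # low half of the pair
//   kmovw %k1, 18(%rsp)   # high half
// (loads are the mirror image via KMOVWkm).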
case X86::MOVPC32r: {
// This is a pseudo op for a two instruction sequence with a label, which
// looks like:
// call "L1$pb"
// "L1$pb":
// popl %esi
// Emit the call.
MCSymbol *PICBase = MF->getPICBaseSymbol();
// FIXME: We would like an efficient form for this, so we don't have to do a
// lot of extra uniquing.
EmitAndCountInstruction(
MCInstBuilder(X86::CALLpcrel32)
.addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
const X86FrameLowering *FrameLowering =
MF->getSubtarget<X86Subtarget>().getFrameLowering();
bool hasFP = FrameLowering->hasFP(*MF);
// TODO: This is needed only if we require precise CFA.
bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
!OutStreamer->getDwarfFrameInfos().back().End;
int stackGrowth = -RI->getSlotSize();
if (HasActiveDwarfFrame && !hasFP) {
OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth);
}
// Emit the label.
OutStreamer->EmitLabel(PICBase);
// popl $reg
EmitAndCountInstruction(
MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg()));
if (HasActiveDwarfFrame && !hasFP) {
OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
}
return;
}
case X86::ADD32ri: {
// Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
break;
// Okay, we have something like:
// EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
// For this, we want to print something like:
// MYGLOBAL + (. - PICBASE)
// However, we can't generate a ".", so just emit a new label here and refer
// to it.
MCSymbol *DotSym = OutContext.createTempSymbol();
OutStreamer->EmitLabel(DotSym);
// Now that we have emitted the label, lower the complex operand expression.
MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
const MCExpr *PICBase =
MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
DotExpr = MCBinaryExpr::createAdd(
MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext);
EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addExpr(DotExpr));
return;
}
case TargetOpcode::STATEPOINT:
return LowerSTATEPOINT(*MI, MCInstLowering);
case TargetOpcode::FAULTING_OP:
return LowerFAULTING_OP(*MI, MCInstLowering);
case TargetOpcode::FENTRY_CALL:
return LowerFENTRY_CALL(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_OP:
return LowerPATCHABLE_OP(*MI, MCInstLowering);
case TargetOpcode::STACKMAP:
return LowerSTACKMAP(*MI);
case TargetOpcode::PATCHPOINT:
return LowerPATCHPOINT(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_RET:
return LowerPATCHABLE_RET(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_TAIL_CALL:
return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_EVENT_CALL:
return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering);
case X86::MORESTACK_RET:
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
return;
case X86::MORESTACK_RET_RESTORE_R10:
// Return, then restore R10.
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
EmitAndCountInstruction(
MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX));
return;
case X86::SEH_PushReg:
case X86::SEH_SaveReg:
case X86::SEH_SaveXMM:
case X86::SEH_StackAlloc:
case X86::SEH_StackAlign:
case X86::SEH_SetFrame:
case X86::SEH_PushFrame:
case X86::SEH_EndPrologue:
EmitSEHInstruction(MI);
return;
case X86::SEH_Epilogue: {
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
MachineBasicBlock::const_iterator MBBI(MI);
// Check if preceded by a call and emit nop if so.
for (MBBI = PrevCrossBBInst(MBBI);
MBBI != MachineBasicBlock::const_iterator();
MBBI = PrevCrossBBInst(MBBI)) {
// Conservatively assume that pseudo instructions don't emit code and keep
// looking for a call. We may emit an unnecessary nop in some cases.
if (!MBBI->isPseudo()) {
if (MBBI->isCall())
EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
break;
}
}
return;
}
// Lower PSHUFB and VPERMILP normally but add a comment if we can find
// a constant shuffle mask. We won't be able to do this at the MC layer
// because the mask isn't an immediate.
case X86::PSHUFBrm:
case X86::VPSHUFBrm:
case X86::VPSHUFBYrm:
case X86::VPSHUFBZ128rm:
case X86::VPSHUFBZ128rmk:
case X86::VPSHUFBZ128rmkz:
case X86::VPSHUFBZ256rm:
case X86::VPSHUFBZ256rmk:
case X86::VPSHUFBZ256rmkz:
case X86::VPSHUFBZrm:
case X86::VPSHUFBZrmk:
case X86::VPSHUFBZrmkz: {
if (!OutStreamer->isVerboseAsm())
break;
unsigned SrcIdx, MaskIdx;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::PSHUFBrm:
case X86::VPSHUFBrm:
case X86::VPSHUFBYrm:
case X86::VPSHUFBZ128rm:
case X86::VPSHUFBZ256rm:
case X86::VPSHUFBZrm:
SrcIdx = 1; MaskIdx = 5; break;
case X86::VPSHUFBZ128rmkz:
case X86::VPSHUFBZ256rmkz:
case X86::VPSHUFBZrmkz:
SrcIdx = 2; MaskIdx = 6; break;
case X86::VPSHUFBZ128rmk:
case X86::VPSHUFBZ256rmk:
case X86::VPSHUFBZrmk:
SrcIdx = 3; MaskIdx = 7; break;
}
assert(MI->getNumOperands() >= 6 &&
"We should always have at least 6 operands!");
const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
SmallVector<int, 64> Mask;
DecodePSHUFBMask(C, Width, Mask);
if (!Mask.empty())
OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
case X86::VPERMILPSrm:
case X86::VPERMILPSYrm:
case X86::VPERMILPSZ128rm:
case X86::VPERMILPSZ128rmk:
case X86::VPERMILPSZ128rmkz:
case X86::VPERMILPSZ256rm:
case X86::VPERMILPSZ256rmk:
case X86::VPERMILPSZ256rmkz:
case X86::VPERMILPSZrm:
case X86::VPERMILPSZrmk:
case X86::VPERMILPSZrmkz:
case X86::VPERMILPDrm:
case X86::VPERMILPDYrm:
case X86::VPERMILPDZ128rm:
case X86::VPERMILPDZ128rmk:
case X86::VPERMILPDZ128rmkz:
case X86::VPERMILPDZ256rm:
case X86::VPERMILPDZ256rmk:
case X86::VPERMILPDZ256rmkz:
case X86::VPERMILPDZrm:
case X86::VPERMILPDZrmk:
case X86::VPERMILPDZrmkz: {
if (!OutStreamer->isVerboseAsm())
break;
unsigned SrcIdx, MaskIdx;
unsigned ElSize;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::VPERMILPSrm:
case X86::VPERMILPSYrm:
case X86::VPERMILPSZ128rm:
case X86::VPERMILPSZ256rm:
case X86::VPERMILPSZrm:
SrcIdx = 1; MaskIdx = 5; ElSize = 32; break;
case X86::VPERMILPSZ128rmkz:
case X86::VPERMILPSZ256rmkz:
case X86::VPERMILPSZrmkz:
SrcIdx = 2; MaskIdx = 6; ElSize = 32; break;
case X86::VPERMILPSZ128rmk:
case X86::VPERMILPSZ256rmk:
case X86::VPERMILPSZrmk:
SrcIdx = 3; MaskIdx = 7; ElSize = 32; break;
case X86::VPERMILPDrm:
case X86::VPERMILPDYrm:
case X86::VPERMILPDZ128rm:
case X86::VPERMILPDZ256rm:
case X86::VPERMILPDZrm:
SrcIdx = 1; MaskIdx = 5; ElSize = 64; break;
case X86::VPERMILPDZ128rmkz:
case X86::VPERMILPDZ256rmkz:
case X86::VPERMILPDZrmkz:
SrcIdx = 2; MaskIdx = 6; ElSize = 64; break;
case X86::VPERMILPDZ128rmk:
case X86::VPERMILPDZ256rmk:
case X86::VPERMILPDZrmk:
SrcIdx = 3; MaskIdx = 7; ElSize = 64; break;
}
assert(MI->getNumOperands() >= 6 &&
"We should always have at least 6 operands!");
const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
SmallVector<int, 16> Mask;
DecodeVPERMILPMask(C, ElSize, Width, Mask);
if (!Mask.empty())
OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
case X86::VPERMIL2PDrm:
case X86::VPERMIL2PSrm:
case X86::VPERMIL2PDYrm:
case X86::VPERMIL2PSYrm: {
if (!OutStreamer->isVerboseAsm())
break;
assert(MI->getNumOperands() >= 8 &&
"We should always have at least 8 operands!");
const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
if (!CtrlOp.isImm())
break;
unsigned ElSize;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::VPERMIL2PSrm: case X86::VPERMIL2PSYrm: ElSize = 32; break;
case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break;
}
const MachineOperand &MaskOp = MI->getOperand(6);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
SmallVector<int, 16> Mask;
DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
if (!Mask.empty())
OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
case X86::VPPERMrrm: {
if (!OutStreamer->isVerboseAsm())
break;
assert(MI->getNumOperands() >= 7 &&
"We should always have at least 7 operands!");
const MachineOperand &MaskOp = MI->getOperand(6);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
SmallVector<int, 16> Mask;
DecodeVPPERMMask(C, Width, Mask);
if (!Mask.empty())
OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
case X86::MMX_MOVQ64rm: {
if (!OutStreamer->isVerboseAsm())
break;
if (MI->getNumOperands() <= 4)
break;
if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
std::string Comment;
raw_string_ostream CS(Comment);
const MachineOperand &DstOp = MI->getOperand(0);
CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
if (auto *CF = dyn_cast<ConstantFP>(C)) {
CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false);
OutStreamer->AddComment(CS.str());
}
}
break;
}
#define MOV_CASE(Prefix, Suffix) \
case X86::Prefix##MOVAPD##Suffix##rm: \
case X86::Prefix##MOVAPS##Suffix##rm: \
case X86::Prefix##MOVUPD##Suffix##rm: \
case X86::Prefix##MOVUPS##Suffix##rm: \
case X86::Prefix##MOVDQA##Suffix##rm: \
case X86::Prefix##MOVDQU##Suffix##rm:
#define MOV_AVX512_CASE(Suffix) \
case X86::VMOVDQA64##Suffix##rm: \
case X86::VMOVDQA32##Suffix##rm: \
case X86::VMOVDQU64##Suffix##rm: \
case X86::VMOVDQU32##Suffix##rm: \
case X86::VMOVDQU16##Suffix##rm: \
case X86::VMOVDQU8##Suffix##rm: \
case X86::VMOVAPS##Suffix##rm: \
case X86::VMOVAPD##Suffix##rm: \
case X86::VMOVUPS##Suffix##rm: \
case X86::VMOVUPD##Suffix##rm:
#define CASE_ALL_MOV_RM() \
MOV_CASE(, ) /* SSE */ \
MOV_CASE(V, ) /* AVX-128 */ \
MOV_CASE(V, Y) /* AVX-256 */ \
MOV_AVX512_CASE(Z) \
MOV_AVX512_CASE(Z256) \
MOV_AVX512_CASE(Z128)
// For loads from a constant pool to a vector register, print the constant
// loaded.
CASE_ALL_MOV_RM()
case X86::VBROADCASTF128:
case X86::VBROADCASTI128:
case X86::VBROADCASTF32X4Z256rm:
case X86::VBROADCASTF32X4rm:
case X86::VBROADCASTF32X8rm:
case X86::VBROADCASTF64X2Z128rm:
case X86::VBROADCASTF64X2rm:
case X86::VBROADCASTF64X4rm:
case X86::VBROADCASTI32X4Z256rm:
case X86::VBROADCASTI32X4rm:
case X86::VBROADCASTI32X8rm:
case X86::VBROADCASTI64X2Z128rm:
case X86::VBROADCASTI64X2rm:
case X86::VBROADCASTI64X4rm:
if (!OutStreamer->isVerboseAsm())
break;
if (MI->getNumOperands() <= 4)
break;
if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
int NumLanes = 1;
// Override NumLanes for the broadcast instructions.
switch (MI->getOpcode()) {
case X86::VBROADCASTF128: NumLanes = 2; break;
case X86::VBROADCASTI128: NumLanes = 2; break;
case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; break;
case X86::VBROADCASTF32X4rm: NumLanes = 4; break;
case X86::VBROADCASTF32X8rm: NumLanes = 2; break;
case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; break;
case X86::VBROADCASTF64X2rm: NumLanes = 4; break;
case X86::VBROADCASTF64X4rm: NumLanes = 2; break;
case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; break;
case X86::VBROADCASTI32X4rm: NumLanes = 4; break;
case X86::VBROADCASTI32X8rm: NumLanes = 2; break;
case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; break;
case X86::VBROADCASTI64X2rm: NumLanes = 4; break;
case X86::VBROADCASTI64X4rm: NumLanes = 2; break;
}
std::string Comment;
raw_string_ostream CS(Comment);
const MachineOperand &DstOp = MI->getOperand(0);
CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
CS << "[";
for (int l = 0; l != NumLanes; ++l) {
for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements;
++i) {
if (i != 0 || l != 0)
CS << ",";
if (CDS->getElementType()->isIntegerTy())
printConstant(CDS->getElementAsAPInt(i), CS);
else if (CDS->getElementType()->isHalfTy() ||
CDS->getElementType()->isFloatTy() ||
CDS->getElementType()->isDoubleTy())
printConstant(CDS->getElementAsAPFloat(i), CS);
else
CS << "?";
}
}
CS << "]";
OutStreamer->AddComment(CS.str());
} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
CS << "<";
for (int l = 0; l != NumLanes; ++l) {
for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands;
++i) {
if (i != 0 || l != 0)
CS << ",";
printConstant(CV->getOperand(i), CS);
}
}
CS << ">";
OutStreamer->AddComment(CS.str());
}
}
break;
case X86::MOVDDUPrm:
case X86::VMOVDDUPrm:
case X86::VMOVDDUPZ128rm:
case X86::VBROADCASTSSrm:
case X86::VBROADCASTSSYrm:
case X86::VBROADCASTSSZ128m:
case X86::VBROADCASTSSZ256m:
case X86::VBROADCASTSSZm:
case X86::VBROADCASTSDYrm:
case X86::VBROADCASTSDZ256m:
case X86::VBROADCASTSDZm:
case X86::VPBROADCASTBrm:
case X86::VPBROADCASTBYrm:
case X86::VPBROADCASTBZ128m:
case X86::VPBROADCASTBZ256m:
case X86::VPBROADCASTBZm:
case X86::VPBROADCASTDrm:
case X86::VPBROADCASTDYrm:
case X86::VPBROADCASTDZ128m:
case X86::VPBROADCASTDZ256m:
case X86::VPBROADCASTDZm:
case X86::VPBROADCASTQrm:
case X86::VPBROADCASTQYrm:
case X86::VPBROADCASTQZ128m:
case X86::VPBROADCASTQZ256m:
case X86::VPBROADCASTQZm:
case X86::VPBROADCASTWrm:
case X86::VPBROADCASTWYrm:
case X86::VPBROADCASTWZ128m:
case X86::VPBROADCASTWZ256m:
case X86::VPBROADCASTWZm:
if (!OutStreamer->isVerboseAsm())
break;
if (MI->getNumOperands() <= 4)
break;
if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
int NumElts;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::MOVDDUPrm: NumElts = 2; break;
case X86::VMOVDDUPrm: NumElts = 2; break;
case X86::VMOVDDUPZ128rm: NumElts = 2; break;
case X86::VBROADCASTSSrm: NumElts = 4; break;
case X86::VBROADCASTSSYrm: NumElts = 8; break;
case X86::VBROADCASTSSZ128m: NumElts = 4; break;
case X86::VBROADCASTSSZ256m: NumElts = 8; break;
case X86::VBROADCASTSSZm: NumElts = 16; break;
case X86::VBROADCASTSDYrm: NumElts = 4; break;
case X86::VBROADCASTSDZ256m: NumElts = 4; break;
case X86::VBROADCASTSDZm: NumElts = 8; break;
case X86::VPBROADCASTBrm: NumElts = 16; break;
case X86::VPBROADCASTBYrm: NumElts = 32; break;
case X86::VPBROADCASTBZ128m: NumElts = 16; break;
case X86::VPBROADCASTBZ256m: NumElts = 32; break;
case X86::VPBROADCASTBZm: NumElts = 64; break;
case X86::VPBROADCASTDrm: NumElts = 4; break;
case X86::VPBROADCASTDYrm: NumElts = 8; break;
case X86::VPBROADCASTDZ128m: NumElts = 4; break;
case X86::VPBROADCASTDZ256m: NumElts = 8; break;
case X86::VPBROADCASTDZm: NumElts = 16; break;
case X86::VPBROADCASTQrm: NumElts = 2; break;
case X86::VPBROADCASTQYrm: NumElts = 4; break;
case X86::VPBROADCASTQZ128m: NumElts = 2; break;
case X86::VPBROADCASTQZ256m: NumElts = 4; break;
case X86::VPBROADCASTQZm: NumElts = 8; break;
case X86::VPBROADCASTWrm: NumElts = 8; break;
case X86::VPBROADCASTWYrm: NumElts = 16; break;
case X86::VPBROADCASTWZ128m: NumElts = 8; break;
case X86::VPBROADCASTWZ256m: NumElts = 16; break;
case X86::VPBROADCASTWZm: NumElts = 32; break;
}
std::string Comment;
raw_string_ostream CS(Comment);
const MachineOperand &DstOp = MI->getOperand(0);
CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
CS << "[";
for (int i = 0; i != NumElts; ++i) {
if (i != 0)
CS << ",";
printConstant(C, CS);
}
CS << "]";
OutStreamer->AddComment(CS.str());
}
}
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
// Stackmap shadows cannot include branch targets, so we can count the bytes
// in a call towards the shadow, but must ensure that no thread returns
// into the stackmap shadow. The only way to achieve this is if the call
// is at the end of the shadow.
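// Illustrative timeline: with an 8-byte shadow and a following 5-byte call,
// 3 bytes of nops are emitted first so the call ends exactly at the shadow
// boundary and its return address lies outside the shadow:
//   .Ltmp0:           # stackmap anchor
//   nop; nop; nop     # 3 bytes of shadow padding
//   callq foo         # 5 bytes; shadow ends at the return address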
if (MI->isCall()) {
// Count the size of the call towards the shadow
SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
// Then flush the shadow so that we fill with nops before the call, not
// after it.
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
// Then emit the call
OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo());
return;
}
EmitAndCountInstruction(TmpInst);
}
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h (revision 362609)
@@ -1,879 +1,902 @@
//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the X86 specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H
#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H
#include "X86FrameLowering.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86SelectionDAGInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/Target/TargetMachine.h"
#include <climits>
#include <memory>
#define GET_SUBTARGETINFO_HEADER
#include "X86GenSubtargetInfo.inc"
namespace llvm {
class GlobalValue;
/// The X86 backend supports a number of different styles of PIC.
///
namespace PICStyles {
enum class Style {
StubPIC, // Used on i386-darwin in pic mode.
GOT, // Used on 32 bit elf when in pic mode.
RIPRel, // Used on X86-64 when in pic mode.
None // Set when not in pic mode.
};
} // end namespace PICStyles
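// For example: PIC code for i386-apple-darwin selects StubPIC, PIC code for
// 32-bit ELF targets selects GOT, x86-64 PIC code selects RIPRel, and
// non-PIC compilations use None.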
class X86Subtarget final : public X86GenSubtargetInfo {
public:
// NOTE: Do not add anything new to this list. Coarse, CPU name based flags
// are not a good idea. We should be migrating away from these.
enum X86ProcFamilyEnum {
Others,
IntelAtom,
IntelSLM
};
protected:
enum X86SSEEnum {
NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
};
enum X863DNowEnum {
NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
};
/// X86 processor family: Intel Atom, and others
X86ProcFamilyEnum X86ProcFamily = Others;
/// Which PIC style to use
PICStyles::Style PICStyle;
const TargetMachine &TM;
/// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
X86SSEEnum X86SSELevel = NoSSE;
/// MMX, 3DNow, 3DNow Athlon, or none supported.
X863DNowEnum X863DNowLevel = NoThreeDNow;
/// True if the processor supports X87 instructions.
bool HasX87 = false;
/// True if the processor supports CMPXCHG8B.
bool HasCmpxchg8b = false;
/// True if this processor has NOPL instruction
/// (generally pentium pro+).
bool HasNOPL = false;
/// True if this processor has conditional move instructions
/// (generally pentium pro+).
bool HasCMov = false;
/// True if the processor supports X86-64 instructions.
bool HasX86_64 = false;
/// True if the processor supports POPCNT.
bool HasPOPCNT = false;
/// True if the processor supports SSE4A instructions.
bool HasSSE4A = false;
/// Target has AES instructions
bool HasAES = false;
bool HasVAES = false;
/// Target has FXSAVE/FXRESTOR instructions
bool HasFXSR = false;
/// Target has XSAVE instructions
bool HasXSAVE = false;
/// Target has XSAVEOPT instructions
bool HasXSAVEOPT = false;
/// Target has XSAVEC instructions
bool HasXSAVEC = false;
/// Target has XSAVES instructions
bool HasXSAVES = false;
/// Target has carry-less multiplication
bool HasPCLMUL = false;
bool HasVPCLMULQDQ = false;
/// Target has Galois Field Arithmetic instructions
bool HasGFNI = false;
/// Target has 3-operand fused multiply-add
bool HasFMA = false;
/// Target has 4-operand fused multiply-add
bool HasFMA4 = false;
/// Target has XOP instructions
bool HasXOP = false;
/// Target has TBM instructions.
bool HasTBM = false;
/// Target has LWP instructions
bool HasLWP = false;
/// True if the processor has the MOVBE instruction.
bool HasMOVBE = false;
/// True if the processor has the RDRAND instruction.
bool HasRDRAND = false;
/// Processor has 16-bit floating point conversion instructions.
bool HasF16C = false;
/// Processor has FS/GS base instructions.
bool HasFSGSBase = false;
/// Processor has LZCNT instruction.
bool HasLZCNT = false;
/// Processor has BMI1 instructions.
bool HasBMI = false;
/// Processor has BMI2 instructions.
bool HasBMI2 = false;
/// Processor has VBMI instructions.
bool HasVBMI = false;
/// Processor has VBMI2 instructions.
bool HasVBMI2 = false;
/// Processor has Integer Fused Multiply Add
bool HasIFMA = false;
/// Processor has RTM instructions.
bool HasRTM = false;
/// Processor has ADX instructions.
bool HasADX = false;
/// Processor has SHA instructions.
bool HasSHA = false;
/// Processor has PRFCHW instructions.
bool HasPRFCHW = false;
/// Processor has RDSEED instructions.
bool HasRDSEED = false;
/// Processor has LAHF/SAHF instructions.
bool HasLAHFSAHF = false;
/// Processor has MONITORX/MWAITX instructions.
bool HasMWAITX = false;
/// Processor has Cache Line Zero instruction
bool HasCLZERO = false;
/// Processor has Cache Line Demote instruction
bool HasCLDEMOTE = false;
/// Processor has MOVDIRI instruction (direct store integer).
bool HasMOVDIRI = false;
/// Processor has MOVDIR64B instruction (direct store 64 bytes).
bool HasMOVDIR64B = false;
/// Processor has ptwrite instruction.
bool HasPTWRITE = false;
/// Processor has Prefetch with intent to Write instruction
bool HasPREFETCHWT1 = false;
/// True if SHLD instructions are slow.
bool IsSHLDSlow = false;
/// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
/// PMULUDQ.
bool IsPMULLDSlow = false;
/// True if the PMADDWD instruction is slow compared to PMULLD.
bool IsPMADDWDSlow = false;
/// True if unaligned memory accesses of 16-bytes are slow.
bool IsUAMem16Slow = false;
/// True if unaligned memory accesses of 32-bytes are slow.
bool IsUAMem32Slow = false;
/// True if SSE operations can have unaligned memory operands.
/// This may require setting a configuration bit in the processor.
bool HasSSEUnalignedMem = false;
/// True if this processor has the CMPXCHG16B instruction;
/// this is true for most x86-64 chips, but not the first AMD chips.
bool HasCmpxchg16b = false;
/// True if the LEA instruction should be used for adjusting
/// the stack pointer. This is an optimization for Intel Atom processors.
bool UseLeaForSP = false;
/// True if POPCNT instruction has a false dependency on the destination register.
bool HasPOPCNTFalseDeps = false;
/// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
bool HasLZCNTFalseDeps = false;
/// True if it is preferable to combine to a single shuffle using a variable
/// mask over multiple fixed shuffles.
bool HasFastVariableShuffle = false;
/// True if vzeroupper instructions should be inserted after code that uses
/// ymm or zmm registers.
bool InsertVZEROUPPER = false;
/// True if there is no performance penalty for writing NOPs with up to
/// 11 bytes.
bool HasFast11ByteNOP = false;
/// True if there is no performance penalty for writing NOPs with up to
/// 15 bytes.
bool HasFast15ByteNOP = false;
/// True if gather is reasonably fast. This is true for Skylake client and
/// all AVX-512 CPUs.
bool HasFastGather = false;
/// True if hardware SQRTSS instruction is at least as fast (latency) as
/// RSQRTSS followed by a Newton-Raphson iteration.
bool HasFastScalarFSQRT = false;
/// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
/// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
bool HasFastVectorFSQRT = false;
/// True if 8-bit divisions are significantly faster than
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32 = false;
/// True if 32-bit divides are significantly faster than
/// 64-bit divisions and should be used when possible.
bool HasSlowDivide64 = false;
/// True if LZCNT instruction is fast.
bool HasFastLZCNT = false;
/// True if SHLD based rotate is fast.
bool HasFastSHLDRotate = false;
/// True if the processor supports macrofusion.
bool HasMacroFusion = false;
/// True if the processor supports branch fusion.
bool HasBranchFusion = false;
/// True if the processor has enhanced REP MOVSB/STOSB.
bool HasERMSB = false;
/// True if short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions = false;
/// True if two memory operand instructions should use a temporary register
/// instead.
bool SlowTwoMemOps = false;
/// True if the LEA instruction inputs have to be ready at address generation
/// (AG) time.
bool LEAUsesAG = false;
/// True if the LEA instruction with certain arguments is slow
bool SlowLEA = false;
/// True if the LEA instruction has all three source operands: base, index,
/// and offset or if the LEA instruction uses base and index registers where
/// the base is EBP, RBP, or R13.
bool Slow3OpsLEA = false;
/// True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec = false;
/// Processor has AVX-512 PreFetch Instructions
bool HasPFI = false;
/// Processor has AVX-512 Exponential and Reciprocal Instructions
bool HasERI = false;
/// Processor has AVX-512 Conflict Detection Instructions
bool HasCDI = false;
/// Processor has AVX-512 population count Instructions
bool HasVPOPCNTDQ = false;
/// Processor has AVX-512 Doubleword and Quadword instructions
bool HasDQI = false;
/// Processor has AVX-512 Byte and Word instructions
bool HasBWI = false;
/// Processor has AVX-512 Vector Length eXtensions
bool HasVLX = false;
/// Processor has PKU extensions
bool HasPKU = false;
/// Processor has AVX-512 Vector Neural Network Instructions
bool HasVNNI = false;
/// Processor has AVX-512 bfloat16 floating-point extensions
bool HasBF16 = false;
/// Processor supports ENQCMD instructions
bool HasENQCMD = false;
/// Processor has AVX-512 Bit Algorithms instructions
bool HasBITALG = false;
/// Processor has AVX-512 vp2intersect instructions
bool HasVP2INTERSECT = false;
/// Deprecated flag for MPX instructions.
bool DeprecatedHasMPX = false;
/// Processor supports CET SHSTK - Control-Flow Enforcement Technology
/// using Shadow Stack
bool HasSHSTK = false;
/// Processor supports Invalidate Process-Context Identifier
bool HasINVPCID = false;
/// Processor has Software Guard Extensions
bool HasSGX = false;
/// Processor supports Flush Cache Line instruction
bool HasCLFLUSHOPT = false;
/// Processor supports Cache Line Write Back instruction
bool HasCLWB = false;
/// Processor supports Write Back No Invalidate instruction
bool HasWBNOINVD = false;
/// Processor supports the RDPID instruction
bool HasRDPID = false;
/// Processor supports WaitPKG instructions
bool HasWAITPKG = false;
/// Processor supports PCONFIG instruction
bool HasPCONFIG = false;
/// Processor has a single uop BEXTR implementation.
bool HasFastBEXTR = false;
/// Try harder to combine to horizontal vector ops if they are fast.
bool HasFastHorizontalOps = false;
/// Prefer a left/right scalar logical shifts pair over a shift+and pair.
bool HasFastScalarShiftMasks = false;
/// Prefer a left/right vector logical shifts pair over a shift+and pair.
bool HasFastVectorShiftMasks = false;
/// Use a retpoline thunk rather than indirect calls to block speculative
/// execution.
bool UseRetpolineIndirectCalls = false;
/// Use a retpoline thunk or remove any indirect branch to block speculative
/// execution.
bool UseRetpolineIndirectBranches = false;
/// Deprecated flag, query `UseRetpolineIndirectCalls` and
/// `UseRetpolineIndirectBranches` instead.
bool DeprecatedUseRetpoline = false;
/// When using a retpoline thunk, call an externally provided thunk rather
/// than emitting one inside the compiler.
bool UseRetpolineExternalThunk = false;
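// A register-based retpoline thunk, whether compiler-emitted or provided
// externally (conventionally named __x86_indirect_thunk_r11 and the like),
// follows this well-known pattern; the sketch below is illustrative:
//   thunk:
//     callq .Lrun
//   .Lcapture:            # speculative execution lands here and spins
//     pause
//     lfence
//     jmp .Lcapture
//   .Lrun:
//     movq %r11, (%rsp)   # replace return address with the real target
//     retq                # "return" to the intended callee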
+ /// Prevent generation of indirect call/branch instructions from memory,
+ /// and force all indirect call/branch instructions from a register to be
+ /// preceded by an LFENCE. Also decompose RET instructions into a
+ /// POP+LFENCE+JMP sequence.
+ bool UseLVIControlFlowIntegrity = false;
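+
+ // For example (an illustrative sketch, not the exact emitted sequence),
+ // with this flag set a plain `retq` is decomposed into:
+ //   popq %rcx    # pop the return address (choice of scratch is illustrative)
+ //   lfence       # block data speculation before the indirect branch
+ //   jmpq *%rcx   # indirect jump, itself covered by the LFENCE/thunk rules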
+
+ /// Insert LFENCE instructions to prevent data speculatively injected into
+ /// loads from being used maliciously.
+ bool UseLVILoadHardening = false;
+
/// Use software floating point for code generation.
bool UseSoftFloat = false;
/// Use alias analysis during code generation.
bool UseAA = false;
/// The minimum alignment known to hold for the stack frame on
/// entry to the function, and which must be maintained by every function.
Align stackAlignment = Align(4);
/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
///
// FIXME: this is a known good value for Yonah. How about others?
unsigned MaxInlineSizeThreshold = 128;
/// Indicates target prefers 128 bit instructions.
bool Prefer128Bit = false;
/// Indicates target prefers 256 bit instructions.
bool Prefer256Bit = false;
/// Indicates target prefers AVX512 mask registers.
bool PreferMaskRegisters = false;
/// Threeway branch is profitable in this subtarget.
bool ThreewayBranchProfitable = false;
/// Use Goldmont specific floating point div/sqrt costs.
bool UseGLMDivSqrtCosts = false;
/// What processor and OS we're targeting.
Triple TargetTriple;
/// GlobalISel related APIs.
std::unique_ptr<CallLowering> CallLoweringInfo;
std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<RegisterBankInfo> RegBankInfo;
std::unique_ptr<InstructionSelector> InstSelector;
private:
/// Override the stack alignment.
MaybeAlign StackAlignOverride;
/// Preferred vector width from function attribute.
unsigned PreferVectorWidthOverride;
/// Resolved preferred vector width from function attribute and subtarget
/// features.
unsigned PreferVectorWidth = UINT32_MAX;
/// Required vector width from function attribute.
unsigned RequiredVectorWidth;
/// True if compiling for 64-bit, false for 16-bit or 32-bit.
bool In64BitMode;
/// True if compiling for 32-bit, false for 16-bit or 64-bit.
bool In32BitMode;
/// True if compiling for 16-bit, false for 32-bit or 64-bit.
bool In16BitMode;
/// Contains the overhead of gather/scatter instructions
int GatherOverhead = 1024;
int ScatterOverhead = 1024;
X86SelectionDAGInfo TSInfo;
// Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
// X86TargetLowering needs.
X86InstrInfo InstrInfo;
X86TargetLowering TLInfo;
X86FrameLowering FrameLowering;
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM, MaybeAlign StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth);
const X86TargetLowering *getTargetLowering() const override {
return &TLInfo;
}
const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
const X86FrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
const X86RegisterInfo *getRegisterInfo() const override {
return &getInstrInfo()->getRegisterInfo();
}
/// Returns the minimum alignment known to hold for the
/// stack frame on entry to the function, and which must be maintained by
/// every function for this subtarget.
Align getStackAlignment() const { return stackAlignment; }
/// Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
/// Methods used by Global ISel
const CallLowering *getCallLowering() const override;
InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
private:
/// Initialize the full set of dependencies so we can use an initializer
/// list for X86Subtarget.
X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void initSubtargetFeatures(StringRef CPU, StringRef FS);
public:
/// Is this x86_64? (disregarding specific ABI / programming model)
bool is64Bit() const {
return In64BitMode;
}
bool is32Bit() const {
return In32BitMode;
}
bool is16Bit() const {
return In16BitMode;
}
/// Is this x86_64 with the ILP32 programming model (x32 ABI)?
bool isTarget64BitILP32() const {
return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 ||
TargetTriple.isOSNaCl());
}
/// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
bool isTarget64BitLP64() const {
return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 &&
!TargetTriple.isOSNaCl());
}
PICStyles::Style getPICStyle() const { return PICStyle; }
void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
bool hasX87() const { return HasX87; }
bool hasCmpxchg8b() const { return HasCmpxchg8b; }
bool hasNOPL() const { return HasNOPL; }
// SSE codegen depends on cmovs, and all SSE1+ processors support them.
// All 64-bit processors support cmov.
bool hasCMov() const { return HasCMov || X86SSELevel >= SSE1 || is64Bit(); }
bool hasSSE1() const { return X86SSELevel >= SSE1; }
bool hasSSE2() const { return X86SSELevel >= SSE2; }
bool hasSSE3() const { return X86SSELevel >= SSE3; }
bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
bool hasSSE41() const { return X86SSELevel >= SSE41; }
bool hasSSE42() const { return X86SSELevel >= SSE42; }
bool hasAVX() const { return X86SSELevel >= AVX; }
bool hasAVX2() const { return X86SSELevel >= AVX2; }
bool hasAVX512() const { return X86SSELevel >= AVX512F; }
bool hasInt256() const { return hasAVX2(); }
bool hasSSE4A() const { return HasSSE4A; }
bool hasMMX() const { return X863DNowLevel >= MMX; }
bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
bool hasPOPCNT() const { return HasPOPCNT; }
bool hasAES() const { return HasAES; }
bool hasVAES() const { return HasVAES; }
bool hasFXSR() const { return HasFXSR; }
bool hasXSAVE() const { return HasXSAVE; }
bool hasXSAVEOPT() const { return HasXSAVEOPT; }
bool hasXSAVEC() const { return HasXSAVEC; }
bool hasXSAVES() const { return HasXSAVES; }
bool hasPCLMUL() const { return HasPCLMUL; }
bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
bool hasGFNI() const { return HasGFNI; }
// Prefer FMA4 to FMA - it's better for commutation/memory folding and
// has equal or better performance on all supported targets.
bool hasFMA() const { return HasFMA; }
bool hasFMA4() const { return HasFMA4; }
bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
bool hasXOP() const { return HasXOP; }
bool hasTBM() const { return HasTBM; }
bool hasLWP() const { return HasLWP; }
bool hasMOVBE() const { return HasMOVBE; }
bool hasRDRAND() const { return HasRDRAND; }
bool hasF16C() const { return HasF16C; }
bool hasFSGSBase() const { return HasFSGSBase; }
bool hasLZCNT() const { return HasLZCNT; }
bool hasBMI() const { return HasBMI; }
bool hasBMI2() const { return HasBMI2; }
bool hasVBMI() const { return HasVBMI; }
bool hasVBMI2() const { return HasVBMI2; }
bool hasIFMA() const { return HasIFMA; }
bool hasRTM() const { return HasRTM; }
bool hasADX() const { return HasADX; }
bool hasSHA() const { return HasSHA; }
bool hasPRFCHW() const { return HasPRFCHW || HasPREFETCHWT1; }
bool hasPREFETCHWT1() const { return HasPREFETCHWT1; }
bool hasSSEPrefetch() const {
// We implicitly enable these when we have a write prefetch supporting a
// cache-level hint, OR if we have prfchw but don't already have a read
// prefetch from 3dnow.
return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1();
}
bool hasRDSEED() const { return HasRDSEED; }
bool hasLAHFSAHF() const { return HasLAHFSAHF; }
bool hasMWAITX() const { return HasMWAITX; }
bool hasCLZERO() const { return HasCLZERO; }
bool hasCLDEMOTE() const { return HasCLDEMOTE; }
bool hasMOVDIRI() const { return HasMOVDIRI; }
bool hasMOVDIR64B() const { return HasMOVDIR64B; }
bool hasPTWRITE() const { return HasPTWRITE; }
bool isSHLDSlow() const { return IsSHLDSlow; }
bool isPMULLDSlow() const { return IsPMULLDSlow; }
bool isPMADDWDSlow() const { return IsPMADDWDSlow; }
bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
int getGatherOverhead() const { return GatherOverhead; }
int getScatterOverhead() const { return ScatterOverhead; }
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
bool hasFastVariableShuffle() const {
return HasFastVariableShuffle;
}
bool insertVZEROUPPER() const { return InsertVZEROUPPER; }
bool hasFastGather() const { return HasFastGather; }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasFastBEXTR() const { return HasFastBEXTR; }
bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; }
bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
bool hasMacroFusion() const { return HasMacroFusion; }
bool hasBranchFusion() const { return HasBranchFusion; }
bool hasERMSB() const { return HasERMSB; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
bool slowTwoMemOps() const { return SlowTwoMemOps; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
bool slow3OpsLEA() const { return Slow3OpsLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; }
bool hasPFI() const { return HasPFI; }
bool hasERI() const { return HasERI; }
bool hasDQI() const { return HasDQI; }
bool hasBWI() const { return HasBWI; }
bool hasVLX() const { return HasVLX; }
bool hasPKU() const { return HasPKU; }
bool hasVNNI() const { return HasVNNI; }
bool hasBF16() const { return HasBF16; }
bool hasVP2INTERSECT() const { return HasVP2INTERSECT; }
bool hasBITALG() const { return HasBITALG; }
bool hasSHSTK() const { return HasSHSTK; }
bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
bool hasCLWB() const { return HasCLWB; }
bool hasWBNOINVD() const { return HasWBNOINVD; }
bool hasRDPID() const { return HasRDPID; }
bool hasWAITPKG() const { return HasWAITPKG; }
bool hasPCONFIG() const { return HasPCONFIG; }
bool hasSGX() const { return HasSGX; }
bool threewayBranchProfitable() const { return ThreewayBranchProfitable; }
bool hasINVPCID() const { return HasINVPCID; }
bool hasENQCMD() const { return HasENQCMD; }
bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
}
bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
+
+ // These are generic getters that OR together all of the thunk types
+ // supported by the subtarget. Therefore useIndirectThunk*() will return true
+ // if any respective thunk feature is enabled.
+ bool useIndirectThunkCalls() const {
+ return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity();
+ }
+ bool useIndirectThunkBranches() const {
+ return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity();
+ }
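+
+ // Sketch of the intended caller-side use (hypothetical, simplified from
+ // real call-lowering logic):
+ //   if (Subtarget.useIndirectThunkCalls())
+ //     ... route the call through a thunk instead of emitting `call *%reg`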
+
bool preferMaskRegisters() const { return PreferMaskRegisters; }
bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
+ bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; }
+ bool useLVILoadHardening() const { return UseLVILoadHardening; }
unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
// Helper functions to determine when we should allow widening to 512-bit
// during codegen.
// TODO: Currently we're always allowing widening on CPUs without VLX,
// because for many cases we don't have a better option.
bool canExtendTo512DQ() const {
return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
}
bool canExtendTo512BW() const {
return hasBWI() && canExtendTo512DQ();
}
// If there are no 512-bit vectors and we prefer not to use 512-bit registers,
// disable them in the legalizer.
bool useAVX512Regs() const {
return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
}
bool useBWIRegs() const {
return hasBWI() && useAVX512Regs();
}
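// Worked example (illustrative): on an AVX512VL CPU compiled with
// "prefer-vector-width=256", getPreferVectorWidth() == 256, so
// canExtendTo512DQ() returns false and useAVX512Regs() only returns true
// for functions whose "min-legal-vector-width" demands more than 256 bits.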
bool isXRaySupported() const override { return is64Bit(); }
X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; }
/// TODO: to be removed later and replaced with suitable properties
bool isAtom() const { return X86ProcFamily == IntelAtom; }
bool isSLM() const { return X86ProcFamily == IntelSLM; }
bool useSoftFloat() const { return UseSoftFloat; }
bool useAA() const override { return UseAA; }
/// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
/// no-sse2). There isn't any reason to disable it if the target processor
/// supports it.
bool hasMFence() const { return hasSSE2() || is64Bit(); }
const Triple &getTargetTriple() const { return TargetTriple; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
bool isTargetPS4() const { return TargetTriple.isPS4CPU(); }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); }
bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); }
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
bool isTargetWindowsMSVC() const {
return TargetTriple.isWindowsMSVCEnvironment();
}
bool isTargetWindowsCoreCLR() const {
return TargetTriple.isWindowsCoreCLREnvironment();
}
bool isTargetWindowsCygwin() const {
return TargetTriple.isWindowsCygwinEnvironment();
}
bool isTargetWindowsGNU() const {
return TargetTriple.isWindowsGNUEnvironment();
}
bool isTargetWindowsItanium() const {
return TargetTriple.isWindowsItaniumEnvironment();
}
bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
bool isOSWindows() const { return TargetTriple.isOSWindows(); }
bool isTargetWin64() const { return In64BitMode && isOSWindows(); }
bool isTargetWin32() const { return !In64BitMode && isOSWindows(); }
bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; }
bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; }
bool isPICStyleStubPIC() const {
return PICStyle == PICStyles::Style::StubPIC;
}
bool isPositionIndependent() const { return TM.isPositionIndependent(); }
bool isCallingConvWin64(CallingConv::ID CC) const {
switch (CC) {
// On Win64, all these conventions just use the default convention.
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Tail:
case CallingConv::Swift:
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_ThisCall:
case CallingConv::X86_VectorCall:
case CallingConv::Intel_OCL_BI:
return isTargetWin64();
// This convention allows using the Win64 convention on other targets.
case CallingConv::Win64:
return true;
// This convention allows using the SysV convention on Windows targets.
case CallingConv::X86_64_SysV:
return false;
// Otherwise, who knows what this is.
default:
return false;
}
}
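// For example, isCallingConvWin64(CallingConv::C) is true only when
// targeting Win64, while CallingConv::Win64 answers true on every target
// and CallingConv::X86_64_SysV always answers false.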
/// Classify a global variable reference for the current subtarget according
/// to how we should reference it in a non-pcrel context.
unsigned char classifyLocalReference(const GlobalValue *GV) const;
unsigned char classifyGlobalReference(const GlobalValue *GV,
const Module &M) const;
unsigned char classifyGlobalReference(const GlobalValue *GV) const;
/// Classify a global function reference for the current subtarget.
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
const Module &M) const;
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const;
/// Classify a blockaddress reference for the current subtarget according to
/// how we should reference it in a non-pcrel context.
unsigned char classifyBlockAddressReference() const;
/// Return true if the subtarget allows calls to immediate address.
bool isLegalToCallImmediateAddr() const;
- /// If we are using retpolines, we need to expand indirectbr to avoid it
+ /// If we are using indirect thunks, we need to expand indirectbr to avoid it
/// lowering to an actual indirect jump.
bool enableIndirectBrExpand() const override {
- return useRetpolineIndirectBranches();
+ return useIndirectThunkBranches();
}
/// Enable the MachineScheduler pass for all X86 subtargets.
bool enableMachineScheduler() const override { return true; }
bool enableEarlyIfConversion() const override;
void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
&Mutations) const override;
AntiDepBreakMode getAntiDepBreakMode() const override {
return TargetSubtargetInfo::ANTIDEP_CRITICAL;
}
bool enableAdvancedRASplitCost() const override { return true; }
};
} // end namespace llvm
#endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H
Index: head/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp (revision 362609)
@@ -1,549 +1,556 @@
//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the X86 specific subclass of TargetMachine.
//
//===----------------------------------------------------------------------===//
#include "X86TargetMachine.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "TargetInfo/X86TargetInfo.h"
#include "X86.h"
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
#include "X86MacroFusion.h"
#include "X86Subtarget.h"
#include "X86TargetObjectFile.h"
#include "X86TargetTransformInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/ExecutionDomainFix.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/CFGuard.h"
#include <memory>
#include <string>
using namespace llvm;
static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
static cl::opt<bool> EnableCondBrFoldingPass("x86-condbr-folding",
cl::desc("Enable the conditional branch "
"folding pass"),
cl::init(false), cl::Hidden);
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
// Register the target.
RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
initializeFixupLEAPassPass(PR);
initializeFPSPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
initializeX86AvoidSFBPassPass(PR);
initializeX86SpeculativeLoadHardeningPassPass(PR);
initializeX86FlagsCopyLoweringPassPass(PR);
initializeX86CondBrFoldingPassPass(PR);
+ initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
+ initializeX86LoadValueInjectionRetHardeningPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO()) {
if (TT.getArch() == Triple::x86_64)
return std::make_unique<X86_64MachoTargetObjectFile>();
return std::make_unique<TargetLoweringObjectFileMachO>();
}
if (TT.isOSFreeBSD())
return std::make_unique<X86FreeBSDTargetObjectFile>();
if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
return std::make_unique<X86LinuxNaClTargetObjectFile>();
if (TT.isOSSolaris())
return std::make_unique<X86SolarisTargetObjectFile>();
if (TT.isOSFuchsia())
return std::make_unique<X86FuchsiaTargetObjectFile>();
if (TT.isOSBinFormatELF())
return std::make_unique<X86ELFTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
return std::make_unique<TargetLoweringObjectFileCOFF>();
llvm_unreachable("unknown subtarget type");
}
static std::string computeDataLayout(const Triple &TT) {
// X86 is little endian
std::string Ret = "e";
Ret += DataLayout::getManglingComponent(TT);
// 32-bit X86 and x32 have 32-bit pointers.
if ((TT.isArch64Bit() &&
(TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) ||
!TT.isArch64Bit())
Ret += "-p:32:32";
// Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers.
Ret += "-p270:32:32-p271:32:32-p272:64:64";
// Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
Ret += "-i64:64";
else if (TT.isOSIAMCU())
Ret += "-i64:32-f64:32";
else
Ret += "-f64:32:64";
// Some ABIs align long double to 128 bits, others to 32.
if (TT.isOSNaCl() || TT.isOSIAMCU())
; // No f80
else if (TT.isArch64Bit() || TT.isOSDarwin())
Ret += "-f80:128";
else
Ret += "-f80:32";
if (TT.isOSIAMCU())
Ret += "-f128:32";
// The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
if (TT.isArch64Bit())
Ret += "-n8:16:32:64";
else
Ret += "-n8:16:32";
// The stack is aligned to 32 bits on some ABIs and 128 bits on others.
if ((!TT.isArch64Bit() && TT.isOSWindows()) || TT.isOSIAMCU())
Ret += "-a:0:32-S32";
else
Ret += "-S128";
return Ret;
}
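// For example, assembling the rules above for an x86-64 Linux triple gives:
//   "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"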
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
bool JIT,
Optional<Reloc::Model> RM) {
bool is64Bit = TT.getArch() == Triple::x86_64;
if (!RM.hasValue()) {
// JIT codegen should use static relocations by default, since it's
// typically executed in process and not relocatable.
if (JIT)
return Reloc::Static;
// Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
// Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
// use static relocation model by default.
if (TT.isOSDarwin()) {
if (is64Bit)
return Reloc::PIC_;
return Reloc::DynamicNoPIC;
}
if (TT.isOSWindows() && is64Bit)
return Reloc::PIC_;
return Reloc::Static;
}
// ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
// is defined as a model for code which may be used in static or dynamic
// executables but not necessarily a shared library. On X86-32 we just
// compile in -static mode; in x86-64 we use PIC.
if (*RM == Reloc::DynamicNoPIC) {
if (is64Bit)
return Reloc::PIC_;
if (!TT.isOSDarwin())
return Reloc::Static;
}
// If we are on Darwin, disallow static relocation model in X86-64 mode, since
// the Mach-O file format doesn't support it.
if (*RM == Reloc::Static && TT.isOSDarwin() && is64Bit)
return Reloc::PIC_;
return *RM;
}
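// For example, with no explicit model: JIT compilation gets Reloc::Static,
// x86-64 Darwin and 64-bit Windows get Reloc::PIC_, and 32-bit Linux gets
// Reloc::Static.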
static CodeModel::Model getEffectiveX86CodeModel(Optional<CodeModel::Model> CM,
bool JIT, bool Is64Bit) {
if (CM) {
if (*CM == CodeModel::Tiny)
report_fatal_error("Target does not support the tiny CodeModel", false);
return *CM;
}
if (JIT)
return Is64Bit ? CodeModel::Large : CodeModel::Small;
return CodeModel::Small;
}
/// Create an X86 target.
///
X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Optional<Reloc::Model> RM,
Optional<CodeModel::Model> CM,
CodeGenOpt::Level OL, bool JIT)
: LLVMTargetMachine(
T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(TT, JIT, RM),
getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64),
OL),
TLOF(createTLOF(getTargetTriple())), IsJIT(JIT) {
// On PS4, the "return address" of a 'noreturn' call must still be within
// the calling function, and TrapUnreachable is an easy way to get that.
if (TT.isPS4() || TT.isOSBinFormatMachO()) {
this->Options.TrapUnreachable = true;
this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO();
}
setMachineOutliner(true);
initAsmInfo();
}
X86TargetMachine::~X86TargetMachine() = default;
const X86Subtarget *
X86TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
StringRef CPU = !CPUAttr.hasAttribute(Attribute::None)
? CPUAttr.getValueAsString()
: (StringRef)TargetCPU;
StringRef FS = !FSAttr.hasAttribute(Attribute::None)
? FSAttr.getValueAsString()
: (StringRef)TargetFS;
SmallString<512> Key;
Key.reserve(CPU.size() + FS.size());
Key += CPU;
Key += FS;
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
// function before we can generate a subtarget. We also need to use
// it as a key for the subtarget since that can be the only difference
// between two functions.
bool SoftFloat =
F.getFnAttribute("use-soft-float").getValueAsString() == "true";
// If the soft float attribute is set on the function, turn on the soft float
// subtarget feature.
if (SoftFloat)
Key += FS.empty() ? "+soft-float" : ",+soft-float";
// Keep track of the key width after all features are added so we can extract
// the feature string out later.
unsigned CPUFSWidth = Key.size();
// Extract prefer-vector-width attribute.
unsigned PreferVectorWidthOverride = 0;
if (F.hasFnAttribute("prefer-vector-width")) {
StringRef Val = F.getFnAttribute("prefer-vector-width").getValueAsString();
unsigned Width;
if (!Val.getAsInteger(0, Width)) {
Key += ",prefer-vector-width=";
Key += Val;
PreferVectorWidthOverride = Width;
}
}
// Extract min-legal-vector-width attribute.
unsigned RequiredVectorWidth = UINT32_MAX;
if (F.hasFnAttribute("min-legal-vector-width")) {
StringRef Val =
F.getFnAttribute("min-legal-vector-width").getValueAsString();
unsigned Width;
if (!Val.getAsInteger(0, Width)) {
Key += ",min-legal-vector-width=";
Key += Val;
RequiredVectorWidth = Width;
}
}
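// For example (illustrative values), a function carrying target-cpu
// "skylake", target-features "+avx2", and "min-legal-vector-width"="256"
// produces the key "skylake+avx2,min-legal-vector-width=256".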
// Extracted here so that we make sure there is backing for the StringRef. If
// we assigned earlier, it's possible the SmallString reallocated, leaving a
// dangling StringRef.
FS = Key.slice(CPU.size(), CPUFSWidth);
auto &I = SubtargetMap[Key];
if (!I) {
// This needs to be done before we create a new subtarget since any
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
I = std::make_unique<X86Subtarget>(
TargetTriple, CPU, FS, *this,
MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride,
RequiredVectorWidth);
}
return I.get();
}
//===----------------------------------------------------------------------===//
// Command line options for x86
//===----------------------------------------------------------------------===//
static cl::opt<bool>
UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
cl::desc("Minimize AVX to SSE transition penalty"),
cl::init(true));
//===----------------------------------------------------------------------===//
// X86 TTI query.
//===----------------------------------------------------------------------===//
TargetTransformInfo
X86TargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(X86TTIImpl(this, F));
}
//===----------------------------------------------------------------------===//
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//
namespace {
/// X86 Code Generator Pass Configuration Options.
class X86PassConfig : public TargetPassConfig {
public:
X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
X86TargetMachine &getX86TargetMachine() const {
return getTM<X86TargetMachine>();
}
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override {
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createX86MacroFusionDAGMutation());
return DAG;
}
ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext *C) const override {
ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
DAG->addMutation(createX86MacroFusionDAGMutation());
return DAG;
}
void addIRPasses() override;
bool addInstSelector() override;
bool addIRTranslator() override;
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
bool addILPOpts() override;
bool addPreISel() override;
void addMachineSSAOptimization() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreSched2() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
class X86ExecutionDomainFix : public ExecutionDomainFix {
public:
static char ID;
X86ExecutionDomainFix() : ExecutionDomainFix(ID, X86::VR128XRegClass) {}
StringRef getPassName() const override {
return "X86 Execution Dependency Fix";
}
};
char X86ExecutionDomainFix::ID;
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(X86ExecutionDomainFix, "x86-execution-domain-fix",
"X86 Execution Domain Fix", false, false)
INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis)
INITIALIZE_PASS_END(X86ExecutionDomainFix, "x86-execution-domain-fix",
"X86 Execution Domain Fix", false, false)
TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
return new X86PassConfig(*this, PM);
}
void X86PassConfig::addIRPasses() {
addPass(createAtomicExpandPass());
TargetPassConfig::addIRPasses();
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createInterleavedAccessPass());
// Add passes that handle indirect branch removal and insertion of a retpoline
// thunk. These will be no-ops unless a function's subtarget has the retpoline
// feature enabled.
addPass(createIndirectBrExpandPass());
// Add Control Flow Guard checks.
const Triple &TT = TM->getTargetTriple();
if (TT.isOSWindows()) {
if (TT.getArch() == Triple::x86_64) {
addPass(createCFGuardDispatchPass());
} else {
addPass(createCFGuardCheckPass());
}
}
}
bool X86PassConfig::addInstSelector() {
// Install an instruction selector.
addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
// For ELF, cleanup any local-dynamic TLS accesses.
if (TM->getTargetTriple().isOSBinFormatELF() &&
getOptLevel() != CodeGenOpt::None)
addPass(createCleanupLocalDynamicTLSPass());
addPass(createX86GlobalBaseRegPass());
return false;
}
bool X86PassConfig::addIRTranslator() {
addPass(new IRTranslator());
return false;
}
bool X86PassConfig::addLegalizeMachineIR() {
addPass(new Legalizer());
return false;
}
bool X86PassConfig::addRegBankSelect() {
addPass(new RegBankSelect());
return false;
}
bool X86PassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect());
return false;
}
bool X86PassConfig::addILPOpts() {
if (EnableCondBrFoldingPass)
addPass(createX86CondBrFolding());
addPass(&EarlyIfConverterID);
if (EnableMachineCombinerPass)
addPass(&MachineCombinerID);
addPass(createX86CmovConverterPass());
return true;
}
bool X86PassConfig::addPreISel() {
// Only add this pass for 32-bit x86 Windows.
const Triple &TT = TM->getTargetTriple();
if (TT.isOSWindows() && TT.getArch() == Triple::x86)
addPass(createX86WinEHStatePass());
return true;
}
void X86PassConfig::addPreRegAlloc() {
if (getOptLevel() != CodeGenOpt::None) {
addPass(&LiveRangeShrinkID);
addPass(createX86FixupSetCC());
addPass(createX86OptimizeLEAs());
addPass(createX86CallFrameOptimization());
addPass(createX86AvoidStoreForwardingBlocks());
}
addPass(createX86SpeculativeLoadHardeningPass());
addPass(createX86FlagsCopyLoweringPass());
addPass(createX86WinAllocaExpander());
}
void X86PassConfig::addMachineSSAOptimization() {
addPass(createX86DomainReassignmentPass());
TargetPassConfig::addMachineSSAOptimization();
}
void X86PassConfig::addPostRegAlloc() {
addPass(createX86FloatingPointStackifierPass());
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createX86LoadValueInjectionLoadHardeningPass());
+ else
+ addPass(createX86LoadValueInjectionLoadHardeningUnoptimizedPass());
}
void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
void X86PassConfig::addPreEmitPass() {
if (getOptLevel() != CodeGenOpt::None) {
addPass(new X86ExecutionDomainFix());
addPass(createBreakFalseDeps());
}
addPass(createX86IndirectBranchTrackingPass());
if (UseVZeroUpper)
addPass(createX86IssueVZeroUpperPass());
if (getOptLevel() != CodeGenOpt::None) {
addPass(createX86FixupBWInsts());
addPass(createX86PadShortFunctions());
addPass(createX86FixupLEAs());
addPass(createX86EvexToVexInsts());
}
addPass(createX86DiscriminateMemOpsPass());
addPass(createX86InsertPrefetchPass());
}
void X86PassConfig::addPreEmitPass2() {
const Triple &TT = TM->getTargetTriple();
const MCAsmInfo *MAI = TM->getMCAsmInfo();
- addPass(createX86RetpolineThunksPass());
+ addPass(createX86IndirectThunksPass());
// Insert extra int3 instructions after trailing call instructions to avoid
// issues in the unwinder.
if (TT.isOSWindows() && TT.getArch() == Triple::x86_64)
addPass(createX86AvoidTrailingCallPass());
// Verify basic block incoming and outgoing CFA offset and register values,
// and correct the CFA calculation rule where needed by inserting appropriate CFI
// instructions.
if (!TT.isOSDarwin() &&
(!TT.isOSWindows() ||
MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI))
addPass(createCFIInstrInserter());
// Identify valid longjmp targets for Windows Control Flow Guard.
if (TT.isOSWindows())
addPass(createCFGuardLongjmpPass());
+ addPass(createX86LoadValueInjectionRetHardeningPass());
}
std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
return getStandardCSEConfigForOpt(TM->getOptLevel());
}
Index: head/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
===================================================================
--- head/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp (revision 362608)
+++ head/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp (revision 362609)
@@ -1,2291 +1,2291 @@
//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the visit functions for add, fadd, sub, and fsub.
//
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/AlignOf.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
#include <utility>
using namespace llvm;
using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
namespace {
/// Class representing the coefficient of a floating-point addend.
/// This class needs to be highly efficient, which is especially true for
/// the constructor. As of this writing, the cost of the default
/// constructor is merely a 4-byte store of zero (assuming the compiler is
/// able to perform write-merging).
///
class FAddendCoef {
public:
// The constructor has to initialize an APFloat, which is unnecessary for
// most addends, whose coefficient is either 1 or -1. So, the constructor
// is expensive. In order to avoid the cost of the constructor, we should
// reuse some instances whenever possible. The pre-created instances
// FAddCombine::Add[0-5] embody this idea.
FAddendCoef() = default;
~FAddendCoef();
// If possible, don't define operator+/operator- etc., because these
// operators inevitably call FAddendCoef's constructor, which is not cheap.
void operator=(const FAddendCoef &A);
void operator+=(const FAddendCoef &A);
void operator*=(const FAddendCoef &S);
void set(short C) {
assert(!insaneIntVal(C) && "Insane coefficient");
IsFp = false; IntVal = C;
}
void set(const APFloat& C);
void negate();
bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); }
Value *getValue(Type *) const;
bool isOne() const { return isInt() && IntVal == 1; }
bool isTwo() const { return isInt() && IntVal == 2; }
bool isMinusOne() const { return isInt() && IntVal == -1; }
bool isMinusTwo() const { return isInt() && IntVal == -2; }
private:
bool insaneIntVal(int V) { return V > 4 || V < -4; }
APFloat *getFpValPtr()
{ return reinterpret_cast<APFloat *>(&FpValBuf.buffer[0]); }
const APFloat *getFpValPtr() const
{ return reinterpret_cast<const APFloat *>(&FpValBuf.buffer[0]); }
const APFloat &getFpVal() const {
assert(IsFp && BufHasFpVal && "Incorrect state");
return *getFpValPtr();
}
APFloat &getFpVal() {
assert(IsFp && BufHasFpVal && "Incorrect state");
return *getFpValPtr();
}
bool isInt() const { return !IsFp; }
// If the coefficient is represented by an integer, promote it to a
// floating-point value.
void convertToFpType(const fltSemantics &Sem);
// Construct an APFloat from a signed integer.
// TODO: We should get rid of this function when APFloat can be constructed
// from a *SIGNED* integer.
APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val);
bool IsFp = false;
// True iff FpValBuf contains an instance of APFloat.
bool BufHasFpVal = false;
// The integer coefficient of an individual addend is either 1 or -1,
// and we try to simplify at most 4 addends from at most two
// neighboring instructions. So the range of <IntVal> falls in [-4, 4].
// APInt would be overkill for this purpose.
short IntVal = 0;
AlignedCharArrayUnion<APFloat> FpValBuf;
};
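// Usage sketch (illustrative; `That` is a hypothetical FP-backed coefficient):
//   FAddendCoef C;   // default-constructed: integer-backed, value 0
//   C.set(1);        // still integer-backed, no APFloat constructed
//   C += That;       // convertToFpType() promotes C, then adds in FP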
/// FAddend is used to represent a floating-point addend. An addend is
/// represented as <C, V>, where V is a symbolic value, and C is a
/// constant coefficient. A constant addend is represented as <C, 0>.
class FAddend {
public:
FAddend() = default;
void operator+=(const FAddend &T) {
assert((Val == T.Val) && "Symbolic-values disagree");
Coeff += T.Coeff;
}
Value *getSymVal() const { return Val; }
const FAddendCoef &getCoef() const { return Coeff; }
bool isConstant() const { return Val == nullptr; }
bool isZero() const { return Coeff.isZero(); }
void set(short Coefficient, Value *V) {
Coeff.set(Coefficient);
Val = V;
}
void set(const APFloat &Coefficient, Value *V) {
Coeff.set(Coefficient);
Val = V;
}
void set(const ConstantFP *Coefficient, Value *V) {
Coeff.set(Coefficient->getValueAPF());
Val = V;
}
void negate() { Coeff.negate(); }
/// Drill down the U-D chain one step to find the definition of V, and
/// try to break the definition into one or two addends.
static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1);
/// Similar to FAddend::drillValueDownOneStep() except that the value being
/// split is the addend itself.
unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
private:
void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
// This addend has the value of "Coeff * Val".
Value *Val = nullptr;
FAddendCoef Coeff;
};
/// FAddCombine is the class for optimizing an unsafe fadd/fsub along
/// with at most two of its neighboring instructions.
///
class FAddCombine {
public:
FAddCombine(InstCombiner::BuilderTy &B) : Builder(B) {}
Value *simplify(Instruction *FAdd);
private:
using AddendVect = SmallVector<const FAddend *, 4>;
Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
/// Convert given addend to a Value
Value *createAddendVal(const FAddend &A, bool& NeedNeg);
/// Return the number of instructions needed to emit the N-ary addition.
unsigned calcInstrNumber(const AddendVect& Vect);
Value *createFSub(Value *Opnd0, Value *Opnd1);
Value *createFAdd(Value *Opnd0, Value *Opnd1);
Value *createFMul(Value *Opnd0, Value *Opnd1);
Value *createFNeg(Value *V);
Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
void createInstPostProc(Instruction *NewInst, bool NoNumber = false);
// Debugging stuff is clustered here.
#ifndef NDEBUG
unsigned CreateInstrNum;
void initCreateInstNum() { CreateInstrNum = 0; }
void incCreateInstNum() { CreateInstrNum++; }
#else
void initCreateInstNum() {}
void incCreateInstNum() {}
#endif
InstCombiner::BuilderTy &Builder;
Instruction *Instr = nullptr;
};
} // end anonymous namespace
//===----------------------------------------------------------------------===//
//
// Implementation of
// {FAddendCoef, FAddend, FAddition, FAddCombine}.
//
//===----------------------------------------------------------------------===//
FAddendCoef::~FAddendCoef() {
if (BufHasFpVal)
getFpValPtr()->~APFloat();
}
void FAddendCoef::set(const APFloat& C) {
APFloat *P = getFpValPtr();
if (isInt()) {
// As the buffer is a meaningless byte stream, we cannot call
// APFloat::operator=().
new(P) APFloat(C);
} else
*P = C;
IsFp = BufHasFpVal = true;
}
void FAddendCoef::convertToFpType(const fltSemantics &Sem) {
if (!isInt())
return;
APFloat *P = getFpValPtr();
if (IntVal > 0)
new(P) APFloat(Sem, IntVal);
else {
new(P) APFloat(Sem, 0 - IntVal);
P->changeSign();
}
IsFp = BufHasFpVal = true;
}
APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) {
if (Val >= 0)
return APFloat(Sem, Val);
APFloat T(Sem, 0 - Val);
T.changeSign();
return T;
}
void FAddendCoef::operator=(const FAddendCoef &That) {
if (That.isInt())
set(That.IntVal);
else
set(That.getFpVal());
}
void FAddendCoef::operator+=(const FAddendCoef &That) {
enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
if (isInt() == That.isInt()) {
if (isInt())
IntVal += That.IntVal;
else
getFpVal().add(That.getFpVal(), RndMode);
return;
}
if (isInt()) {
const APFloat &T = That.getFpVal();
convertToFpType(T.getSemantics());
getFpVal().add(T, RndMode);
return;
}
APFloat &T = getFpVal();
T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode);
}
void FAddendCoef::operator*=(const FAddendCoef &That) {
if (That.isOne())
return;
if (That.isMinusOne()) {
negate();
return;
}
if (isInt() && That.isInt()) {
int Res = IntVal * (int)That.IntVal;
assert(!insaneIntVal(Res) && "Insane int value");
IntVal = Res;
return;
}
const fltSemantics &Semantic =
isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics();
if (isInt())
convertToFpType(Semantic);
APFloat &F0 = getFpVal();
if (That.isInt())
F0.multiply(createAPFloatFromInt(Semantic, That.IntVal),
APFloat::rmNearestTiesToEven);
else
F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
}
void FAddendCoef::negate() {
if (isInt())
IntVal = 0 - IntVal;
else
getFpVal().changeSign();
}
Value *FAddendCoef::getValue(Type *Ty) const {
return isInt() ?
ConstantFP::get(Ty, float(IntVal)) :
ConstantFP::get(Ty->getContext(), getFpVal());
}
// The definition of <Val>        Addends
// =========================================
//  A + B                         <1, A>, <1, B>
//  A - B                         <1, A>, <-1, B>
//  0 - B                         <-1, B>
//  C * A                         <C, A>
//  A + C                         <1, A>, <C, NULL>
//  0 +/- 0                       <0, NULL> (corner case)
//
// Legend: A and B are not constant, C is constant
unsigned FAddend::drillValueDownOneStep
(Value *Val, FAddend &Addend0, FAddend &Addend1) {
Instruction *I = nullptr;
if (!Val || !(I = dyn_cast<Instruction>(Val)))
return 0;
unsigned Opcode = I->getOpcode();
if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
ConstantFP *C0, *C1;
Value *Opnd0 = I->getOperand(0);
Value *Opnd1 = I->getOperand(1);
if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
Opnd0 = nullptr;
if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
Opnd1 = nullptr;
if (Opnd0) {
if (!C0)
Addend0.set(1, Opnd0);
else
Addend0.set(C0, nullptr);
}
if (Opnd1) {
FAddend &Addend = Opnd0 ? Addend1 : Addend0;
if (!C1)
Addend.set(1, Opnd1);
else
Addend.set(C1, nullptr);
if (Opcode == Instruction::FSub)
Addend.negate();
}
if (Opnd0 || Opnd1)
return Opnd0 && Opnd1 ? 2 : 1;
// Both operands are zero. Weird!
Addend0.set(APFloat(C0->getValueAPF().getSemantics()), nullptr);
return 1;
}
if (I->getOpcode() == Instruction::FMul) {
Value *V0 = I->getOperand(0);
Value *V1 = I->getOperand(1);
if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
Addend0.set(C, V1);
return 1;
}
if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
Addend0.set(C, V0);
return 1;
}
}
return 0;
}
// Try to break *this* addend into two addends. e.g. suppose this addend is
// <2.3, V> and V = X + Y; by calling this function, we obtain the two
// addends <2.3, X> and <2.3, Y>.
unsigned FAddend::drillAddendDownOneStep
(FAddend &Addend0, FAddend &Addend1) const {
if (isConstant())
return 0;
unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1);
if (!BreakNum || Coeff.isOne())
return BreakNum;
Addend0.Scale(Coeff);
if (BreakNum == 2)
Addend1.Scale(Coeff);
return BreakNum;
}
Value *FAddCombine::simplify(Instruction *I) {
assert(I->hasAllowReassoc() && I->hasNoSignedZeros() &&
"Expected 'reassoc'+'nsz' instruction");
// Currently we are not able to handle vector types.
if (I->getType()->isVectorTy())
return nullptr;
assert((I->getOpcode() == Instruction::FAdd ||
I->getOpcode() == Instruction::FSub) && "Expect add/sub");
// Save the instruction before calling other member-functions.
Instr = I;
FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1);
// Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1.
unsigned Opnd0_ExpNum = 0;
unsigned Opnd1_ExpNum = 0;
if (!Opnd0.isConstant())
Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1);
// Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
if (OpndNum == 2 && !Opnd1.isConstant())
Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1);
// Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
if (Opnd0_ExpNum && Opnd1_ExpNum) {
AddendVect AllOpnds;
AllOpnds.push_back(&Opnd0_0);
AllOpnds.push_back(&Opnd1_0);
if (Opnd0_ExpNum == 2)
AllOpnds.push_back(&Opnd0_1);
if (Opnd1_ExpNum == 2)
AllOpnds.push_back(&Opnd1_1);
// Compute instruction quota. We should save at least one instruction.
unsigned InstQuota = 0;
Value *V0 = I->getOperand(0);
Value *V1 = I->getOperand(1);
InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
(!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
return R;
}
if (OpndNum != 2) {
// The input instruction is "I = 0.0 +/- V". If the "V" were able to be
// split into two addends, say "V = X - Y", the instruction would have
// been optimized into "I = Y - X" in the previous steps.
//
const FAddendCoef &CE = Opnd0.getCoef();
return CE.isOne() ? Opnd0.getSymVal() : nullptr;
}
// Step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
if (Opnd1_ExpNum) {
AddendVect AllOpnds;
AllOpnds.push_back(&Opnd0);
AllOpnds.push_back(&Opnd1_0);
if (Opnd1_ExpNum == 2)
AllOpnds.push_back(&Opnd1_1);
if (Value *R = simplifyFAdd(AllOpnds, 1))
return R;
}
// Step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
if (Opnd0_ExpNum) {
AddendVect AllOpnds;
AllOpnds.push_back(&Opnd1);
AllOpnds.push_back(&Opnd0_0);
if (Opnd0_ExpNum == 2)
AllOpnds.push_back(&Opnd0_1);
if (Value *R = simplifyFAdd(AllOpnds, 1))
return R;
}
return nullptr;
}
Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
unsigned AddendNum = Addends.size();
assert(AddendNum <= 4 && "Too many addends");
// For saving intermediate results.
unsigned NextTmpIdx = 0;
FAddend TmpResult[3];
// Points to the constant addend of the resulting simplified expression.
// If the resulting expression has a constant addend, it is desirable for
// that constant to reside at the top of the resulting expression tree.
// Placing constants close to the super-expression(s) will potentially
// reveal optimization opportunities in the super-expression(s).
const FAddend *ConstAdd = nullptr;
// Simplified addends are placed in <SimpVect>.
AddendVect SimpVect;
// The outer loop works on one symbolic-value at a time. Suppose the input
// addends are: <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ...
// The symbolic-values will be processed in this order: x, y, z.
for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) {
const FAddend *ThisAddend = Addends[SymIdx];
if (!ThisAddend) {
// This addend was processed before.
continue;
}
Value *Val = ThisAddend->getSymVal();
unsigned StartIdx = SimpVect.size();
SimpVect.push_back(ThisAddend);
// The inner loop collects addends sharing the same symbolic value; these
// addends will later be folded into a single addend. Following the above
// example, if the symbolic value "y" is being processed, the inner loop
// will collect the two addends "<b1, y>" and "<b2, y>", which will later
// be folded into "<b1+b2, y>".
for (unsigned SameSymIdx = SymIdx + 1;
SameSymIdx < AddendNum; SameSymIdx++) {
const FAddend *T = Addends[SameSymIdx];
if (T && T->getSymVal() == Val) {
// Set to null so that the next iteration of the outer loop will not
// process this addend again.
Addends[SameSymIdx] = nullptr;
SimpVect.push_back(T);
}
}
// If multiple addends share same symbolic value, fold them together.
if (StartIdx + 1 != SimpVect.size()) {
FAddend &R = TmpResult[NextTmpIdx ++];
R = *SimpVect[StartIdx];
for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++)
R += *SimpVect[Idx];
// Pop all addends being folded and push the resulting folded addend.
SimpVect.resize(StartIdx);
if (Val) {
if (!R.isZero()) {
SimpVect.push_back(&R);
}
} else {
// Don't push the constant addend at this time. It will become the last
// element of <SimpVect>.
ConstAdd = &R;
}
}
}
assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) &&
"out-of-bound access");
if (ConstAdd)
SimpVect.push_back(ConstAdd);
Value *Result;
if (!SimpVect.empty())
Result = createNaryFAdd(SimpVect, InstrQuota);
else {
// The addition is folded to 0.0.
Result = ConstantFP::get(Instr->getType(), 0.0);
}
return Result;
}
Value *FAddCombine::createNaryFAdd
(const AddendVect &Opnds, unsigned InstrQuota) {
assert(!Opnds.empty() && "Expect at least one addend");
// Step 1: Check if the # of instructions needed exceeds the quota.
unsigned InstrNeeded = calcInstrNumber(Opnds);
if (InstrNeeded > InstrQuota)
return nullptr;
initCreateInstNum();
// Step 2: Emit the N-ary addition.
// Note that at most three instructions are involved in Fadd-InstCombine: the
// addition in question, and at most two neighboring instructions.
// The resulting optimized addition should have at least one less instruction
// than the original addition expression tree. This implies that the resulting
// N-ary addition has at most two instructions, and we don't need to worry
// about tree-height when constructing the N-ary addition.
Value *LastVal = nullptr;
bool LastValNeedNeg = false;
// Iterate the addends, creating fadd/fsub from two adjacent addends.
for (const FAddend *Opnd : Opnds) {
bool NeedNeg;
Value *V = createAddendVal(*Opnd, NeedNeg);
if (!LastVal) {
LastVal = V;
LastValNeedNeg = NeedNeg;
continue;
}
if (LastValNeedNeg == NeedNeg) {
LastVal = createFAdd(LastVal, V);
continue;
}
if (LastValNeedNeg)
LastVal = createFSub(V, LastVal);
else
LastVal = createFSub(LastVal, V);
LastValNeedNeg = false;
}
if (LastValNeedNeg) {
LastVal = createFNeg(LastVal);
}
#ifndef NDEBUG
assert(CreateInstrNum == InstrNeeded &&
"Inconsistent in instruction numbers");
#endif
return LastVal;
}
Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) {
Value *V = Builder.CreateFSub(Opnd0, Opnd1);
if (Instruction *I = dyn_cast<Instruction>(V))
createInstPostProc(I);
return V;
}
Value *FAddCombine::createFNeg(Value *V) {
Value *Zero = cast<Value>(ConstantFP::getZeroValueForNegation(V->getType()));
Value *NewV = createFSub(Zero, V);
if (Instruction *I = dyn_cast<Instruction>(NewV))
createInstPostProc(I, true); // fneg's don't receive instruction numbers.
return NewV;
}
Value *FAddCombine::createFAdd(Value *Opnd0, Value *Opnd1) {
Value *V = Builder.CreateFAdd(Opnd0, Opnd1);
if (Instruction *I = dyn_cast<Instruction>(V))
createInstPostProc(I);
return V;
}
Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
Value *V = Builder.CreateFMul(Opnd0, Opnd1);
if (Instruction *I = dyn_cast<Instruction>(V))
createInstPostProc(I);
return V;
}
void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) {
NewInstr->setDebugLoc(Instr->getDebugLoc());
// Keep track of the number of instructions created.
if (!NoNumber)
incCreateInstNum();
// Propagate fast-math flags
NewInstr->setFastMathFlags(Instr->getFastMathFlags());
}
// Return the number of instructions needed to emit the N-ary addition.
// NOTE: Keep this function in sync with createAddendVal().
unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
unsigned OpndNum = Opnds.size();
unsigned InstrNeeded = OpndNum - 1;
// The number of addends in the form of "(-1)*x".
unsigned NegOpndNum = 0;
// Adjust the number of instructions needed to emit the N-ary add.
for (const FAddend *Opnd : Opnds) {
if (Opnd->isConstant())
continue;
// The constant check above is really for a few special constant
// coefficients.
if (isa<UndefValue>(Opnd->getSymVal()))
continue;
const FAddendCoef &CE = Opnd->getCoef();
if (CE.isMinusOne() || CE.isMinusTwo())
NegOpndNum++;
// Let the addend be "c * x". If "c == +/-1", the value of the addend
// is immediately available; otherwise, it needs exactly one instruction
// to evaluate the value.
if (!CE.isMinusOne() && !CE.isOne())
InstrNeeded++;
}
if (NegOpndNum == OpndNum)
InstrNeeded++;
return InstrNeeded;
}
// Input Addend        Value            NeedNeg(output)
// ================================================================
// Constant C          C                false
// <+/-1, V>           V                coefficient is -1
// <2/-2, V>           "fadd V, V"      coefficient is -2
// <C, V>              "fmul V, C"      false
//
// NOTE: Keep this function in sync with FAddCombine::calcInstrNumber.
Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
const FAddendCoef &Coeff = Opnd.getCoef();
if (Opnd.isConstant()) {
NeedNeg = false;
return Coeff.getValue(Instr->getType());
}
Value *OpndVal = Opnd.getSymVal();
if (Coeff.isMinusOne() || Coeff.isOne()) {
NeedNeg = Coeff.isMinusOne();
return OpndVal;
}
if (Coeff.isTwo() || Coeff.isMinusTwo()) {
NeedNeg = Coeff.isMinusTwo();
return createFAdd(OpndVal, OpndVal);
}
NeedNeg = false;
return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
}
// Checks if any operand is negative and we can convert add to sub.
// This function checks for the following negative patterns
//   ADD(XOR(OR(Z, NOT(C)), C), 1) == NEG(AND(Z, C))
//   ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C))
//   XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even
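// For example (illustrative, i8 values): with Z = 0x35 and C = 0x0F,
//   ADD(XOR(AND(Z, C), C), 1) = ((0x35 & 0x0F) ^ 0x0F) + 1 = 0x0A + 1 = 0x0B
//   NEG(OR(Z, ~C))            = -(0x35 | 0xF0) = -0xF5 = 0x0B (mod 2^8)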
static Value *checkForNegativeOperand(BinaryOperator &I,
InstCombiner::BuilderTy &Builder) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
// This function creates 2 instructions to replace ADD, so we need at least
// one of LHS or RHS to have a single use to ensure a benefit from the
// transform.
if (!LHS->hasOneUse() && !RHS->hasOneUse())
return nullptr;
Value *X = nullptr, *Y = nullptr, *Z = nullptr;
const APInt *C1 = nullptr, *C2 = nullptr;
// if ONE is on the other side, swap
if (match(RHS, m_Add(m_Value(X), m_One())))
std::swap(LHS, RHS);
if (match(LHS, m_Add(m_Value(X), m_One()))) {
// if XOR is on the other side, swap
if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
std::swap(X, RHS);
if (match(X, m_Xor(m_Value(Y), m_APInt(C1)))) {
// X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1))
// ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1))
if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) {
Value *NewAnd = Builder.CreateAnd(Z, *C1);
return Builder.CreateSub(RHS, NewAnd, "sub");
} else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) {
// X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1))
// ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1))
Value *NewOr = Builder.CreateOr(Z, ~(*C1));
return Builder.CreateSub(RHS, NewOr, "sub");
}
}
}
// Restore LHS and RHS
LHS = I.getOperand(0);
RHS = I.getOperand(1);
// if XOR is on the other side, swap
if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
std::swap(LHS, RHS);
// C1 is ODD (so C2 == C1 - 1 is even)
// LHS = XOR(Y, C1), Y = AND(Z, C2), C1 == (C2 + 1) => LHS == NEG(OR(Z, ~C2))
// ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2))
if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1))))
if (C1->countTrailingZeros() == 0)
if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) {
Value *NewOr = Builder.CreateOr(Z, ~(*C2));
return Builder.CreateSub(RHS, NewOr, "sub");
}
return nullptr;
}
/// Wrapping flags may allow combining constants separated by an extend.
static Instruction *foldNoWrapAdd(BinaryOperator &Add,
InstCombiner::BuilderTy &Builder) {
Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
Type *Ty = Add.getType();
Constant *Op1C;
if (!match(Op1, m_Constant(Op1C)))
return nullptr;
// Try this match first because it results in an add in the narrow type.
// (zext (X +nuw C2)) + C1 --> zext (X + (C2 + trunc(C1)))
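// For example (illustrative): (zext i8 (X +nuw 5) to i32) + (-2) becomes
// zext i8 (X +nuw 3) to i32; the nuw on the narrow add guarantees X <= 250,
// so X + 3 cannot wrap either and the wide subtraction never goes negative.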
Value *X;
const APInt *C1, *C2;
if (match(Op1, m_APInt(C1)) &&
match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2))))) &&
C1->isNegative() && C1->sge(-C2->sext(C1->getBitWidth()))) {
Constant *NewC =
ConstantInt::get(X->getType(), *C2 + C1->trunc(C2->getBitWidth()));
return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
}
// More general combining of constants in the wide type.
// (sext (X +nsw NarrowC)) + C --> (sext X) + (sext(NarrowC) + C)
Constant *NarrowC;
if (match(Op0, m_OneUse(m_SExt(m_NSWAdd(m_Value(X), m_Constant(NarrowC)))))) {
Constant *WideC = ConstantExpr::getSExt(NarrowC, Ty);
Constant *NewC = ConstantExpr::getAdd(WideC, Op1C);
Value *WideX = Builder.CreateSExt(X, Ty);
return BinaryOperator::CreateAdd(WideX, NewC);
}
// (zext (X +nuw NarrowC)) + C --> (zext X) + (zext(NarrowC) + C)
if (match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_Constant(NarrowC)))))) {
Constant *WideC = ConstantExpr::getZExt(NarrowC, Ty);
Constant *NewC = ConstantExpr::getAdd(WideC, Op1C);
Value *WideX = Builder.CreateZExt(X, Ty);
return BinaryOperator::CreateAdd(WideX, NewC);
}
return nullptr;
}
Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) {
Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
Constant *Op1C;
if (!match(Op1, m_Constant(Op1C)))
return nullptr;
if (Instruction *NV = foldBinOpIntoSelectOrPhi(Add))
return NV;
Value *X;
Constant *Op00C;
// add (sub C1, X), C2 --> sub (add C1, C2), X
if (match(Op0, m_Sub(m_Constant(Op00C), m_Value(X))))
return BinaryOperator::CreateSub(ConstantExpr::getAdd(Op00C, Op1C), X);
Value *Y;
// add (sub X, Y), -1 --> add (not Y), X
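// (valid because ~Y == -Y - 1, so (X - Y) + (-1) == X + (-Y - 1) == X + ~Y)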
if (match(Op0, m_OneUse(m_Sub(m_Value(X), m_Value(Y)))) &&
match(Op1, m_AllOnes()))
return BinaryOperator::CreateAdd(Builder.CreateNot(Y), X);
// zext(bool) + C -> bool ? C + 1 : C
if (match(Op0, m_ZExt(m_Value(X))) &&
X->getType()->getScalarSizeInBits() == 1)
return SelectInst::Create(X, AddOne(Op1C), Op1);
// sext(bool) + C -> bool ? C - 1 : C
if (match(Op0, m_SExt(m_Value(X))) &&
X->getType()->getScalarSizeInBits() == 1)
return SelectInst::Create(X, SubOne(Op1C), Op1);
// ~X + C --> (C-1) - X
if (match(Op0, m_Not(m_Value(X))))
return BinaryOperator::CreateSub(SubOne(Op1C), X);
const APInt *C;
if (!match(Op1, m_APInt(C)))
return nullptr;
// (X | C2) + C --> (X | C2) ^ C2 iff (C2 == -C)
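// (every bit of C2 is known set in (X | C2), so adding -C2, i.e. subtracting
// C2, borrows nothing and simply clears those bits, which is the xor)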
const APInt *C2;
if (match(Op0, m_Or(m_Value(), m_APInt(C2))) && *C2 == -*C)
return BinaryOperator::CreateXor(Op0, ConstantInt::get(Add.getType(), *C2));
if (C->isSignMask()) {
// If wrapping is not allowed, then the addition must set the sign bit:
// X + (signmask) --> X | signmask
if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
return BinaryOperator::CreateOr(Op0, Op1);
// If wrapping is allowed, then the addition flips the sign bit of LHS:
// X + (signmask) --> X ^ signmask
return BinaryOperator::CreateXor(Op0, Op1);
}
// Is this add the last step in a convoluted sext?
// add(zext(xor i16 X, -32768), -32768) --> sext X
Type *Ty = Add.getType();
if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
return CastInst::Create(Instruction::SExt, X, Ty);
if (C->isOneValue() && Op0->hasOneUse()) {
// add (sext i1 X), 1 --> zext (not X)
// TODO: The smallest IR representation is (select X, 0, 1), and that would
// not require the one-use check. But we need to remove a transform in
// visitSelect and make sure that IR value tracking for select is equal or
// better than for these ops.
if (match(Op0, m_SExt(m_Value(X))) &&
X->getType()->getScalarSizeInBits() == 1)
return new ZExtInst(Builder.CreateNot(X), Ty);
// Shifts and add used to flip and mask off the low bit:
// add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
const APInt *C3;
if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) &&
C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
Value *NotX = Builder.CreateNot(X);
return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
}
}
return nullptr;
}
// Matches multiplication expression Op * C where C is a constant. Returns the
// constant value in C and the other operand in Op. Returns true if such a
// match is found.
static bool MatchMul(Value *E, Value *&Op, APInt &C) {
const APInt *AI;
if (match(E, m_Mul(m_Value(Op), m_APInt(AI)))) {
C = *AI;
return true;
}
if (match(E, m_Shl(m_Value(Op), m_APInt(AI)))) {
C = APInt(AI->getBitWidth(), 1);
C <<= *AI;
return true;
}
return false;
}
// Matches remainder expression Op % C where C is a constant. Returns the
// constant value in C and the other operand in Op. Returns the signedness of
// the remainder operation in IsSigned. Returns true if such a match is
// found.
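// For example (illustrative): both "X urem 8" and "X & 7" match with C == 8,
// since masking by a power-of-2 minus one is an unsigned remainder.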
static bool MatchRem(Value *E, Value *&Op, APInt &C, bool &IsSigned) {
const APInt *AI;
IsSigned = false;
if (match(E, m_SRem(m_Value(Op), m_APInt(AI)))) {
IsSigned = true;
C = *AI;
return true;
}
if (match(E, m_URem(m_Value(Op), m_APInt(AI)))) {
C = *AI;
return true;
}
if (match(E, m_And(m_Value(Op), m_APInt(AI))) && (*AI + 1).isPowerOf2()) {
C = *AI + 1;
return true;
}
return false;
}
// Matches division expression Op / C with the given signedness as indicated
// by IsSigned, where C is a constant. Returns the constant value in C and the
// other operand in Op. Returns true if such a match is found.
static bool MatchDiv(Value *E, Value *&Op, APInt &C, bool IsSigned) {
const APInt *AI;
if (IsSigned && match(E, m_SDiv(m_Value(Op), m_APInt(AI)))) {
C = *AI;
return true;
}
if (!IsSigned) {
if (match(E, m_UDiv(m_Value(Op), m_APInt(AI)))) {
C = *AI;
return true;
}
if (match(E, m_LShr(m_Value(Op), m_APInt(AI)))) {
C = APInt(AI->getBitWidth(), 1);
C <<= *AI;
return true;
}
}
return false;
}
// Returns whether C0 * C1 with the given signedness overflows.
static bool MulWillOverflow(APInt &C0, APInt &C1, bool IsSigned) {
bool overflow;
if (IsSigned)
(void)C0.smul_ov(C1, overflow);
else
(void)C0.umul_ov(C1, overflow);
return overflow;
}
// Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1)
// does not overflow.
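// For example (illustrative): "X urem 4 + ((X udiv 4) urem 8) * 4" becomes
// "X urem 32": writing X = 32*q + r, the two terms reassemble the low two
// bits and the next three bits of r, and 4 * 8 == 32 does not overflow.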
Value *InstCombiner::SimplifyAddWithRemainder(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
Value *X, *MulOpV;
APInt C0, MulOpC;
bool IsSigned;
// Match I = X % C0 + MulOpV * C0
if (((MatchRem(LHS, X, C0, IsSigned) && MatchMul(RHS, MulOpV, MulOpC)) ||
(MatchRem(RHS, X, C0, IsSigned) && MatchMul(LHS, MulOpV, MulOpC))) &&
C0 == MulOpC) {
Value *RemOpV;
APInt C1;
bool Rem2IsSigned;
// Match MulOpC = RemOpV % C1
if (MatchRem(MulOpV, RemOpV, C1, Rem2IsSigned) &&
IsSigned == Rem2IsSigned) {
Value *DivOpV;
APInt DivOpC;
// Match RemOpV = X / C0
if (MatchDiv(RemOpV, DivOpV, DivOpC, IsSigned) && X == DivOpV &&
C0 == DivOpC && !MulWillOverflow(C0, C1, IsSigned)) {
Value *NewDivisor =
ConstantInt::get(X->getType()->getContext(), C0 * C1);
return IsSigned ? Builder.CreateSRem(X, NewDivisor, "srem")
: Builder.CreateURem(X, NewDivisor, "urem");
}
}
}
return nullptr;
}
/// Fold
/// (1 << NBits) - 1
/// Into:
/// ~(-(1 << NBits))
/// Because a 'not' is better for bit-tracking analysis and other transforms
/// than an 'add'. The new shl is always nsw, and is nuw if the old `add` was.
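// For example (illustrative, i8, NBits == 3):
//   (1 << 3) - 1 == 0b00000111, and
//   ~(-(1 << 3)) == ~0b11111000 == 0b00000111,
// where -(1 << NBits) is materialized below as (-1 << NBits).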
static Instruction *canonicalizeLowbitMask(BinaryOperator &I,
InstCombiner::BuilderTy &Builder) {
Value *NBits;
if (!match(&I, m_Add(m_OneUse(m_Shl(m_One(), m_Value(NBits))), m_AllOnes())))
return nullptr;
Constant *MinusOne = Constant::getAllOnesValue(NBits->getType());
Value *NotMask = Builder.CreateShl(MinusOne, NBits, "notmask");
// Be wary of constant folding.
if (auto *BOp = dyn_cast<BinaryOperator>(NotMask)) {
// Always NSW. But NUW propagates from `add`.
BOp->setHasNoSignedWrap();
BOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
}
return BinaryOperator::CreateNot(NotMask, I.getName());
}
static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) {
assert(I.getOpcode() == Instruction::Add && "Expecting add instruction");
Type *Ty = I.getType();
auto getUAddSat = [&]() {
return Intrinsic::getDeclaration(I.getModule(), Intrinsic::uadd_sat, Ty);
};
// add (umin X, ~Y), Y --> uaddsat X, Y
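// (if X <= ~Y, then X + Y fits and the umin yields X; otherwise the result
// is ~Y + Y == UINT_MAX, which is exactly the unsigned saturation value)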
Value *X, *Y;
if (match(&I, m_c_Add(m_c_UMin(m_Value(X), m_Not(m_Value(Y))),
m_Deferred(Y))))
return CallInst::Create(getUAddSat(), { X, Y });
// add (umin X, ~C), C --> uaddsat X, C
const APInt *C, *NotC;
if (match(&I, m_Add(m_UMin(m_Value(X), m_APInt(NotC)), m_APInt(C))) &&
*C == ~*NotC)
return CallInst::Create(getUAddSat(), { X, ConstantInt::get(Ty, *C) });
return nullptr;
}
Instruction *
InstCombiner::canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(
BinaryOperator &I) {
assert((I.getOpcode() == Instruction::Add ||
I.getOpcode() == Instruction::Or ||
I.getOpcode() == Instruction::Sub) &&
"Expecting add/or/sub instruction");
// We have a subtraction/addition between a (potentially truncated) *logical*
// right-shift of X and a "select".
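// For example (illustrative, i8, extracting the high NBits == 3 bits):
//   (X u>> 5) - ((X s< 0) ? (1 << 3) : 0)  ==  X s>> 5
//   (X u>> 5) + ((X s< 0) ? (-1 << 3) : 0) ==  X s>> 5
// because the select re-creates the sign bits that the lshr zeroed out.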
Value *X, *Select;
Instruction *LowBitsToSkip, *Extract;
if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_CombineAnd(
m_LShr(m_Value(X), m_Instruction(LowBitsToSkip)),
m_Instruction(Extract))),
m_Value(Select))))
return nullptr;
// `add`/`or` is commutative; but for `sub`, "select" *must* be on RHS.
if (I.getOpcode() == Instruction::Sub && I.getOperand(1) != Select)
return nullptr;
Type *XTy = X->getType();
bool HadTrunc = I.getType() != XTy;
// If there was a truncation of the extracted value, then we'll need to
// produce one extra instruction, so we need to ensure one instruction will
// go away.
if (HadTrunc && !match(&I, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
return nullptr;
// Extraction should extract high NBits bits, with shift amount calculated as:
// low bits to skip = shift bitwidth - high bits to extract
// The shift amount itself may be extended, and we need to look past zero-ext
// when matching NBits; that will matter for matching later.
Constant *C;
Value *NBits;
if (!match(
LowBitsToSkip,
m_ZExtOrSelf(m_Sub(m_Constant(C), m_ZExtOrSelf(m_Value(NBits))))) ||
!match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
APInt(C->getType()->getScalarSizeInBits(),
X->getType()->getScalarSizeInBits()))))
return nullptr;
// The sign-extending value can be zero-extended if we `sub`tract it,
// or sign-extended otherwise.
auto SkipExtInMagic = [&I](Value *&V) {
if (I.getOpcode() == Instruction::Sub)
match(V, m_ZExtOrSelf(m_Value(V)));
else
match(V, m_SExtOrSelf(m_Value(V)));
};
// Now, finally validate the sign-extending magic.
// `select` itself may be appropriately extended, look past that.
SkipExtInMagic(Select);
ICmpInst::Predicate Pred;
const APInt *Thr;
Value *SignExtendingValue, *Zero;
bool ShouldSignext;
// It must be a select between two values we will later establish to be a
// sign-extending value and a zero constant. The condition guarding the
// sign-extension must be based on a sign bit of the same X we had in `lshr`.
if (!match(Select, m_Select(m_ICmp(Pred, m_Specific(X), m_APInt(Thr)),
m_Value(SignExtendingValue), m_Value(Zero))) ||
!isSignBitCheck(Pred, *Thr, ShouldSignext))
return nullptr;
// icmp-select pair is commutative.
if (!ShouldSignext)
std::swap(SignExtendingValue, Zero);
// If we should not perform sign-extension then we must add/or/subtract zero.
if (!match(Zero, m_Zero()))
return nullptr;
// Otherwise, it should be some constant, left-shifted by the same NBits we
// had in `lshr`. Said left-shift can also be appropriately extended.
// Again, we must look past zero-ext when looking for NBits.
SkipExtInMagic(SignExtendingValue);
Constant *SignExtendingValueBaseConstant;
if (!match(SignExtendingValue,
m_Shl(m_Constant(SignExtendingValueBaseConstant),
m_ZExtOrSelf(m_Specific(NBits)))))
return nullptr;
// If we `sub`, then the constant should be one, else it should be all-ones.
if (I.getOpcode() == Instruction::Sub
? !match(SignExtendingValueBaseConstant, m_One())
: !match(SignExtendingValueBaseConstant, m_AllOnes()))
return nullptr;
auto *NewAShr = BinaryOperator::CreateAShr(X, LowBitsToSkip,
Extract->getName() + ".sext");
NewAShr->copyIRFlags(Extract); // Preserve `exact`-ness.
if (!HadTrunc)
return NewAShr;
Builder.Insert(NewAShr);
return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType());
}
Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1),
I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (SimplifyAssociativeOrCommutative(I))
return &I;
if (Instruction *X = foldVectorBinop(I))
return X;
// (A*B)+(A*C) -> A*(B+C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
if (Instruction *X = foldAddWithConstant(I))
return X;
if (Instruction *X = foldNoWrapAdd(I, Builder))
return X;
// FIXME: This should be moved into the above helper function to allow these
// transforms for general constant or constant splat vectors.
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
Type *Ty = I.getType();
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
Value *XorLHS = nullptr; ConstantInt *XorRHS = nullptr;
if (match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) {
unsigned TySizeBits = Ty->getScalarSizeInBits();
const APInt &RHSVal = CI->getValue();
unsigned ExtendAmt = 0;
// If we have ADD(XOR(AND(X, 0xFF), 0x80), 0xF..F80), it's a sext.
// If we have ADD(XOR(AND(X, 0xFF), 0xF..F80), 0x80), it's a sext.
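// For example (illustrative, i32): ADD(XOR(AND(X, 0xFF), 0x80), 0xFFFFFF80)
// sign-extends the low 8 bits of X; ExtendAmt becomes 24 and the pattern is
// rebuilt below as ((X & 0xFF) << 24) ashr 24.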
if (XorRHS->getValue() == -RHSVal) {
if (RHSVal.isPowerOf2())
ExtendAmt = TySizeBits - RHSVal.logBase2() - 1;
else if (XorRHS->getValue().isPowerOf2())
ExtendAmt = TySizeBits - XorRHS->getValue().logBase2() - 1;
}
if (ExtendAmt) {
APInt Mask = APInt::getHighBitsSet(TySizeBits, ExtendAmt);
if (!MaskedValueIsZero(XorLHS, Mask, 0, &I))
ExtendAmt = 0;
}
if (ExtendAmt) {
Constant *ShAmt = ConstantInt::get(Ty, ExtendAmt);
Value *NewShl = Builder.CreateShl(XorLHS, ShAmt, "sext");
return BinaryOperator::CreateAShr(NewShl, ShAmt);
}
// If this is a xor that was canonicalized from a sub, turn it back into
// a sub and fuse this add with it.
if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) {
KnownBits LHSKnown = computeKnownBits(XorLHS, 0, &I);
if ((XorRHS->getValue() | LHSKnown.Zero).isAllOnesValue())
return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI),
XorLHS);
}
// (X + signmask) + C could have gotten canonicalized to (X^signmask) + C,
// transform them into (X + (signmask ^ C))
if (XorRHS->getValue().isSignMask())
return BinaryOperator::CreateAdd(XorLHS,
ConstantExpr::getXor(XorRHS, CI));
}
}
if (Ty->isIntOrIntVectorTy(1))
return BinaryOperator::CreateXor(LHS, RHS);
// X + X --> X << 1
if (LHS == RHS) {
auto *Shl = BinaryOperator::CreateShl(LHS, ConstantInt::get(Ty, 1));
Shl->setHasNoSignedWrap(I.hasNoSignedWrap());
Shl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
return Shl;
}
Value *A, *B;
if (match(LHS, m_Neg(m_Value(A)))) {
// -A + -B --> -(A + B)
if (match(RHS, m_Neg(m_Value(B))))
return BinaryOperator::CreateNeg(Builder.CreateAdd(A, B));
// -A + B --> B - A
return BinaryOperator::CreateSub(RHS, A);
}
// A + -B --> A - B
if (match(RHS, m_Neg(m_Value(B))))
return BinaryOperator::CreateSub(LHS, B);
if (Value *V = checkForNegativeOperand(I, Builder))
return replaceInstUsesWith(I, V);
// (A + 1) + ~B --> A - B
// ~B + (A + 1) --> A - B
// (~B + A) + 1 --> A - B
// (A + ~B) + 1 --> A - B
if (match(&I, m_c_BinOp(m_Add(m_Value(A), m_One()), m_Not(m_Value(B)))) ||
match(&I, m_BinOp(m_c_Add(m_Not(m_Value(B)), m_Value(A)), m_One())))
return BinaryOperator::CreateSub(A, B);
// X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1)
if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V);
// A+B --> A|B iff A and B have no bits set in common.
if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
return BinaryOperator::CreateOr(LHS, RHS);
// FIXME: We already did a check for ConstantInt RHS above this.
// FIXME: Is this pattern covered by another fold? No regression tests fail on
// removal.
if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) {
// (X & FF00) + xx00 -> (X+xx00) & FF00
Value *X;
ConstantInt *C2;
if (LHS->hasOneUse() &&
match(LHS, m_And(m_Value(X), m_ConstantInt(C2))) &&
CRHS->getValue() == (CRHS->getValue() & C2->getValue())) {
// See if all bits from the first bit set in the Add RHS up are included
// in the mask. First, get the rightmost bit.
const APInt &AddRHSV = CRHS->getValue();
// Form a mask of all bits from the lowest bit added through the top.
APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1));
// See if the and mask includes all of these bits.
APInt AddRHSHighBitsAnd(AddRHSHighBits & C2->getValue());
if (AddRHSHighBits == AddRHSHighBitsAnd) {
// Okay, the xform is safe. Insert the new add pronto.
Value *NewAdd = Builder.CreateAdd(X, CRHS, LHS->getName());
return BinaryOperator::CreateAnd(NewAdd, C2);
}
}
}
// add (select X 0 (sub n A)) A --> select X A n
{
SelectInst *SI = dyn_cast<SelectInst>(LHS);
Value *A = RHS;
if (!SI) {
SI = dyn_cast<SelectInst>(RHS);
A = LHS;
}
if (SI && SI->hasOneUse()) {
Value *TV = SI->getTrueValue();
Value *FV = SI->getFalseValue();
Value *N;
// Can we fold the add into the argument of the select?
// We check both true and false select arguments for a matching subtract.
if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
// Fold the add into the true select value.
return SelectInst::Create(SI->getCondition(), N, A);
if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
// Fold the add into the false select value.
return SelectInst::Create(SI->getCondition(), A, N);
}
}
if (Instruction *Ext = narrowMathIfNoOverflow(I))
return Ext;
// (add (xor A, B) (and A, B)) --> (or A, B)
// (add (and A, B) (xor A, B)) --> (or A, B)
if (match(&I, m_c_BinOp(m_Xor(m_Value(A), m_Value(B)),
m_c_And(m_Deferred(A), m_Deferred(B)))))
return BinaryOperator::CreateOr(A, B);
// (add (or A, B) (and A, B)) --> (add A, B)
// (add (and A, B) (or A, B)) --> (add A, B)
if (match(&I, m_c_BinOp(m_Or(m_Value(A), m_Value(B)),
m_c_And(m_Deferred(A), m_Deferred(B))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
}
// TODO(jingyue): Consider willNotOverflowSignedAdd and
// willNotOverflowUnsignedAdd to reduce the number of invocations of
// computeKnownBits.
bool Changed = false;
if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
if (Instruction *V = canonicalizeLowbitMask(I, Builder))
return V;
if (Instruction *V =
canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
return V;
if (Instruction *SatAdd = foldToUnsignedSaturatedAdd(I))
return SatAdd;
return Changed ? &I : nullptr;
}
/// Eliminate an op from a linear interpolation (lerp) pattern.
static Instruction *factorizeLerp(BinaryOperator &I,
InstCombiner::BuilderTy &Builder) {
Value *X, *Y, *Z;
if (!match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_Value(Y),
m_OneUse(m_FSub(m_FPOne(),
m_Value(Z))))),
m_OneUse(m_c_FMul(m_Value(X), m_Deferred(Z))))))
return nullptr;
// (Y * (1.0 - Z)) + (X * Z) --> Y + Z * (X - Y) [8 commuted variants]
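// (algebraically: Y - Y*Z + X*Z == Y + Z*(X - Y); this reassociation is
// justified by the 'reassoc'/'nsz' flags required for this combine)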
Value *XY = Builder.CreateFSubFMF(X, Y, &I);
Value *MulZ = Builder.CreateFMulFMF(Z, XY, &I);
return BinaryOperator::CreateFAddFMF(Y, MulZ, &I);
}
/// Factor a common operand out of fadd/fsub of fmul/fdiv.
static Instruction *factorizeFAddFSub(BinaryOperator &I,
InstCombiner::BuilderTy &Builder) {
assert((I.getOpcode() == Instruction::FAdd ||
I.getOpcode() == Instruction::FSub) && "Expecting fadd/fsub");
assert(I.hasAllowReassoc() && I.hasNoSignedZeros() &&
"FP factorization requires FMF");
if (Instruction *Lerp = factorizeLerp(I, Builder))
return Lerp;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Value *X, *Y, *Z;
bool IsFMul;
if ((match(Op0, m_OneUse(m_FMul(m_Value(X), m_Value(Z)))) &&
match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))) ||
(match(Op0, m_OneUse(m_FMul(m_Value(Z), m_Value(X)))) &&
match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))))
IsFMul = true;
else if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Z)))) &&
match(Op1, m_OneUse(m_FDiv(m_Value(Y), m_Specific(Z)))))
IsFMul = false;
else
return nullptr;
// (X * Z) + (Y * Z) --> (X + Y) * Z
// (X * Z) - (Y * Z) --> (X - Y) * Z
// (X / Z) + (Y / Z) --> (X + Y) / Z
// (X / Z) - (Y / Z) --> (X - Y) / Z
bool IsFAdd = I.getOpcode() == Instruction::FAdd;
Value *XY = IsFAdd ? Builder.CreateFAddFMF(X, Y, &I)
: Builder.CreateFSubFMF(X, Y, &I);
// Bail out if we just created a denormal constant.
// TODO: This is copied from a previous implementation. Is it necessary?
const APFloat *C;
if (match(XY, m_APFloat(C)) && !C->isNormal())
return nullptr;
return IsFMul ? BinaryOperator::CreateFMulFMF(XY, Z, &I)
: BinaryOperator::CreateFDivFMF(XY, Z, &I);
}
Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1),
I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (SimplifyAssociativeOrCommutative(I))
return &I;
if (Instruction *X = foldVectorBinop(I))
return X;
if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I))
return FoldedFAdd;
// (-X) + Y --> Y - X
Value *X, *Y;
if (match(&I, m_c_FAdd(m_FNeg(m_Value(X)), m_Value(Y))))
return BinaryOperator::CreateFSubFMF(Y, X, &I);
// Similar to above, but look through fmul/fdiv for the negated term.
// (-X * Y) + Z --> Z - (X * Y) [4 commuted variants]
Value *Z;
if (match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))),
m_Value(Z)))) {
Value *XY = Builder.CreateFMulFMF(X, Y, &I);
return BinaryOperator::CreateFSubFMF(Z, XY, &I);
}
// (-X / Y) + Z --> Z - (X / Y) [2 commuted variants]
// (X / -Y) + Z --> Z - (X / Y) [2 commuted variants]
if (match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y))),
m_Value(Z))) ||
match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))),
m_Value(Z)))) {
Value *XY = Builder.CreateFDivFMF(X, Y, &I);
return BinaryOperator::CreateFSubFMF(Z, XY, &I);
}
// Check for (fadd double (sitofp x), y), see if we can merge this into an
// integer add followed by a promotion.
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
Value *LHSIntVal = LHSConv->getOperand(0);
Type *FPType = LHSConv->getType();
// TODO: This check is overly conservative. In many cases known bits
// analysis can tell us that the result of the addition has fewer significant
// bits than the integer type can hold.
auto IsValidPromotion = [](Type *FTy, Type *ITy) {
Type *FScalarTy = FTy->getScalarType();
Type *IScalarTy = ITy->getScalarType();
// Do we have enough bits in the significand to represent the result of
// the integer addition?
unsigned MaxRepresentableBits =
APFloat::semanticsPrecision(FScalarTy->getFltSemantics());
return IScalarTy->getIntegerBitWidth() <= MaxRepresentableBits;
};
// (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
// ... if the constant fits in the integer value. This is useful for things
// like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
// requires a constant pool load, and generally allows the add to be better
// instcombined.
if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
if (IsValidPromotion(FPType, LHSIntVal->getType())) {
Constant *CI =
ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
if (LHSConv->hasOneUse() &&
ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
willNotOverflowSignedAdd(LHSIntVal, CI, I)) {
// Insert the new integer add.
Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, CI, "addconv");
return new SIToFPInst(NewAdd, I.getType());
}
}
// (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
Value *RHSIntVal = RHSConv->getOperand(0);
// It's enough to check LHS types only because we require int types to
// be the same for this transform.
if (IsValidPromotion(FPType, LHSIntVal->getType())) {
// Only do this if x/y have the same type, if at least one of them has a
// single use (so we don't increase the number of int->fp conversions),
// and if the integer add will not overflow.
if (LHSIntVal->getType() == RHSIntVal->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
willNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
// Insert the new integer add.
Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, RHSIntVal, "addconv");
return new SIToFPInst(NewAdd, I.getType());
}
}
}
}
// Handle special cases for FAdd with selects feeding the operation
if (Value *V = SimplifySelectsFeedingBinaryOp(I, LHS, RHS))
return replaceInstUsesWith(I, V);
if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
if (Instruction *F = factorizeFAddFSub(I, Builder))
return F;
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
}
return nullptr;
}
/// Optimize differences between pointers into the same array into a size.
/// Consider: &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the
/// pointer operands to the ptrtoint instructions for the LHS/RHS of the
/// subtract.
Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
Type *Ty, bool IsNUW) {
// If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
// this.
bool Swapped = false;
GEPOperator *GEP1 = nullptr, *GEP2 = nullptr;
// For now we require one side to be the base pointer "A" or a constant
// GEP derived from it.
if (GEPOperator *LHSGEP = dyn_cast<GEPOperator>(LHS)) {
// (gep X, ...) - X
if (LHSGEP->getOperand(0) == RHS) {
GEP1 = LHSGEP;
Swapped = false;
} else if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) {
// (gep X, ...) - (gep X, ...)
if (LHSGEP->getOperand(0)->stripPointerCasts() ==
RHSGEP->getOperand(0)->stripPointerCasts()) {
GEP2 = RHSGEP;
GEP1 = LHSGEP;
Swapped = false;
}
}
}
if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) {
// X - (gep X, ...)
if (RHSGEP->getOperand(0) == LHS) {
GEP1 = RHSGEP;
Swapped = true;
} else if (GEPOperator *LHSGEP = dyn_cast<GEPOperator>(LHS)) {
// (gep X, ...) - (gep X, ...)
if (RHSGEP->getOperand(0)->stripPointerCasts() ==
LHSGEP->getOperand(0)->stripPointerCasts()) {
GEP2 = LHSGEP;
GEP1 = RHSGEP;
Swapped = true;
}
}
}
if (!GEP1)
// No GEP found.
return nullptr;
if (GEP2) {
// (gep X, ...) - (gep X, ...)
//
// Avoid duplicating the arithmetic if there is more than one non-constant
// index between the two GEPs and either GEP has a non-constant index and
// multiple users. If there are zero non-constant indices, the result is a
// constant and there is no duplication. If there is one non-constant index,
// the result is an add or sub with a constant, which is no larger than the
// original code, and there's no duplicated arithmetic, even if either GEP
// has multiple users. If there is more than one non-constant index combined,
// then as long as the GEP with at least one non-constant index doesn't have
// multiple users, there is no duplication.
unsigned NumNonConstantIndices1 = GEP1->countNonConstantIndices();
unsigned NumNonConstantIndices2 = GEP2->countNonConstantIndices();
if (NumNonConstantIndices1 + NumNonConstantIndices2 > 1 &&
((NumNonConstantIndices1 > 0 && !GEP1->hasOneUse()) ||
(NumNonConstantIndices2 > 0 && !GEP2->hasOneUse()))) {
return nullptr;
}
}
// Emit the offset of the GEP as an intptr_t.
Value *Result = EmitGEPOffset(GEP1);
// If this is a single inbounds GEP and the original sub was nuw,
// then the final multiplication is also nuw. We match an extra add zero
// here, because that's what EmitGEPOffset() generates.
Instruction *I;
if (IsNUW && !GEP2 && !Swapped && GEP1->isInBounds() &&
match(Result, m_Add(m_Instruction(I), m_Zero())) &&
I->getOpcode() == Instruction::Mul)
I->setHasNoUnsignedWrap();
// If we had a constant expression GEP on the other side offsetting the
// pointer, subtract it from the offset we have.
if (GEP2) {
Value *Offset = EmitGEPOffset(GEP2);
Result = Builder.CreateSub(Result, Offset);
}
// If we have p - gep(p, ...) then we have to negate the result.
if (Swapped)
Result = Builder.CreateNeg(Result, "diff.neg");
return Builder.CreateIntCast(Result, Ty, true);
}
Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1),
I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (Instruction *X = foldVectorBinop(I))
return X;
// (A*B)-(A*C) -> A*(B-C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
// If this is a 'B = x-(-A)', change to B = x+A.
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = dyn_castNegVal(Op1)) {
BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
if (const auto *BO = dyn_cast<BinaryOperator>(Op1)) {
assert(BO->getOpcode() == Instruction::Sub &&
"Expected a subtraction operator!");
if (BO->hasNoSignedWrap() && I.hasNoSignedWrap())
Res->setHasNoSignedWrap(true);
} else {
if (cast<Constant>(Op1)->isNotMinSignedValue() && I.hasNoSignedWrap())
Res->setHasNoSignedWrap(true);
}
return Res;
}
if (I.getType()->isIntOrIntVectorTy(1))
return BinaryOperator::CreateXor(Op0, Op1);
// Replace (-1 - A) with (~A).
if (match(Op0, m_AllOnes()))
return BinaryOperator::CreateNot(Op1);
// (~X) - (~Y) --> Y - X
Value *X, *Y;
if (match(Op0, m_Not(m_Value(X))) && match(Op1, m_Not(m_Value(Y))))
return BinaryOperator::CreateSub(Y, X);
// (X + -1) - Y --> ~Y + X
if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes()))))
return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X);
// Y - (X + 1) --> ~X + Y
if (match(Op1, m_OneUse(m_Add(m_Value(X), m_One()))))
return BinaryOperator::CreateAdd(Builder.CreateNot(X), Op0);
// Y - ~X --> (X + 1) + Y
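// (valid because ~X == -X - 1, so Y - ~X == Y + X + 1)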
if (match(Op1, m_OneUse(m_Not(m_Value(X))))) {
return BinaryOperator::CreateAdd(
Builder.CreateAdd(Op0, ConstantInt::get(I.getType(), 1)), X);
}
if (Constant *C = dyn_cast<Constant>(Op0)) {
bool IsNegate = match(C, m_ZeroInt());
Value *X;
if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
// 0 - (zext bool) --> sext bool
// C - (zext bool) --> bool ? C - 1 : C
if (IsNegate)
return CastInst::CreateSExtOrBitCast(X, I.getType());
return SelectInst::Create(X, SubOne(C), C);
}
if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
// 0 - (sext bool) --> zext bool
// C - (sext bool) --> bool ? C + 1 : C
if (IsNegate)
return CastInst::CreateZExtOrBitCast(X, I.getType());
return SelectInst::Create(X, AddOne(C), C);
}
// C - ~X == X + (1+C)
if (match(Op1, m_Not(m_Value(X))))
return BinaryOperator::CreateAdd(X, AddOne(C));
// Try to fold constant sub into select arguments.
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
// Try to fold constant sub into PHI values.
if (PHINode *PN = dyn_cast<PHINode>(Op1))
if (Instruction *R = foldOpIntoPhi(I, PN))
return R;
Constant *C2;
// C-(C2-X) --> X+(C-C2)
- if (match(Op1, m_Sub(m_Constant(C2), m_Value(X))))
+ if (match(Op1, m_Sub(m_Constant(C2), m_Value(X))) && !isa<ConstantExpr>(C2))
return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2));
// C-(X+C2) --> (C-C2)-X
if (match(Op1, m_Add(m_Value(X), m_Constant(C2))))
return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
}
const APInt *Op0C;
if (match(Op0, m_APInt(Op0C))) {
if (Op0C->isNullValue()) {
Value *Op1Wide;
match(Op1, m_TruncOrSelf(m_Value(Op1Wide)));
bool HadTrunc = Op1Wide != Op1;
bool NoTruncOrTruncIsOneUse = !HadTrunc || Op1->hasOneUse();
unsigned BitWidth = Op1Wide->getType()->getScalarSizeInBits();
Value *X;
const APInt *ShAmt;
// -(X >>u 31) -> (X >>s 31)
if (NoTruncOrTruncIsOneUse &&
match(Op1Wide, m_LShr(m_Value(X), m_APInt(ShAmt))) &&
*ShAmt == BitWidth - 1) {
Value *ShAmtOp = cast<Instruction>(Op1Wide)->getOperand(1);
Instruction *NewShift = BinaryOperator::CreateAShr(X, ShAmtOp);
NewShift->copyIRFlags(Op1Wide);
if (!HadTrunc)
return NewShift;
Builder.Insert(NewShift);
return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType());
}
// -(X >>s 31) -> (X >>u 31)
if (NoTruncOrTruncIsOneUse &&
match(Op1Wide, m_AShr(m_Value(X), m_APInt(ShAmt))) &&
*ShAmt == BitWidth - 1) {
Value *ShAmtOp = cast<Instruction>(Op1Wide)->getOperand(1);
Instruction *NewShift = BinaryOperator::CreateLShr(X, ShAmtOp);
NewShift->copyIRFlags(Op1Wide);
if (!HadTrunc)
return NewShift;
Builder.Insert(NewShift);
return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType());
}
if (!HadTrunc && Op1->hasOneUse()) {
Value *LHS, *RHS;
SelectPatternFlavor SPF = matchSelectPattern(Op1, LHS, RHS).Flavor;
if (SPF == SPF_ABS || SPF == SPF_NABS) {
// This is a negate of an ABS/NABS pattern. Just swap the operands
// of the select.
cast<SelectInst>(Op1)->swapValues();
// Don't swap prof metadata, we didn't change the branch behavior.
return replaceInstUsesWith(I, Op1);
}
}
}
// Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
// zero.
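// (e.g. 15 - X == 15 ^ X whenever X is known to fit in the low 4 bits,
// because subtracting from an all-ones mask never borrows)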
if (Op0C->isMask()) {
KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
if ((*Op0C | RHSKnown.Zero).isAllOnesValue())
return BinaryOperator::CreateXor(Op1, Op0);
}
}
{
Value *Y;
// X-(X+Y) == -Y    X-(Y+X) == -Y
if (match(Op1, m_c_Add(m_Specific(Op0), m_Value(Y))))
return BinaryOperator::CreateNeg(Y);
// (X-Y)-X == -Y
if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y))))
return BinaryOperator::CreateNeg(Y);
}
// (sub (or A, B) (and A, B)) --> (xor A, B)
{
Value *A, *B;
if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateXor(A, B);
}
// (sub (and A, B) (or A, B)) --> neg (xor A, B)
{
Value *A, *B;
if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) &&
(Op0->hasOneUse() || Op1->hasOneUse()))
return BinaryOperator::CreateNeg(Builder.CreateXor(A, B));
}
// (sub (or A, B), (xor A, B)) --> (and A, B)
{
Value *A, *B;
if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateAnd(A, B);
}
// (sub (xor A, B) (or A, B)) --> neg (and A, B)
{
Value *A, *B;
if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) &&
(Op0->hasOneUse() || Op1->hasOneUse()))
return BinaryOperator::CreateNeg(Builder.CreateAnd(A, B));
}
{
Value *Y;
// ((X | Y) - X) --> (~X & Y)
if (match(Op0, m_OneUse(m_c_Or(m_Value(Y), m_Specific(Op1)))))
return BinaryOperator::CreateAnd(
Y, Builder.CreateNot(Op1, Op1->getName() + ".not"));
}
{
// (sub (and Op1, (neg X)), Op1) --> neg (and Op1, (add X, -1))
Value *X;
if (match(Op0, m_OneUse(m_c_And(m_Specific(Op1),
m_OneUse(m_Neg(m_Value(X))))))) {
return BinaryOperator::CreateNeg(Builder.CreateAnd(
Op1, Builder.CreateAdd(X, Constant::getAllOnesValue(I.getType()))));
}
}
{
// (sub (and Op1, C), Op1) --> neg (and Op1, ~C)
Constant *C;
if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_Constant(C))))) {
return BinaryOperator::CreateNeg(
Builder.CreateAnd(Op1, Builder.CreateNot(C)));
}
}
{
// If we have a subtraction between some value and a select between
// said value and something else, sink subtraction into select hands, i.e.:
// sub (select %Cond, %TrueVal, %FalseVal), %Op1
// ->
// select %Cond, (sub %TrueVal, %Op1), (sub %FalseVal, %Op1)
// or
// sub %Op0, (select %Cond, %TrueVal, %FalseVal)
// ->
// select %Cond, (sub %Op0, %TrueVal), (sub %Op0, %FalseVal)
// This will result in a select between the new subtraction and 0.
auto SinkSubIntoSelect =
[Ty = I.getType()](Value *Select, Value *OtherHandOfSub,
auto SubBuilder) -> Instruction * {
Value *Cond, *TrueVal, *FalseVal;
if (!match(Select, m_OneUse(m_Select(m_Value(Cond), m_Value(TrueVal),
m_Value(FalseVal)))))
return nullptr;
if (OtherHandOfSub != TrueVal && OtherHandOfSub != FalseVal)
return nullptr;
// While it is really tempting to just create two subtractions and let
// InstCombine fold one of those to 0, it isn't possible to do so
// because of worklist visitation order. So ugly it is.
bool OtherHandOfSubIsTrueVal = OtherHandOfSub == TrueVal;
Value *NewSub = SubBuilder(OtherHandOfSubIsTrueVal ? FalseVal : TrueVal);
Constant *Zero = Constant::getNullValue(Ty);
SelectInst *NewSel =
SelectInst::Create(Cond, OtherHandOfSubIsTrueVal ? Zero : NewSub,
OtherHandOfSubIsTrueVal ? NewSub : Zero);
// Preserve prof metadata if any.
NewSel->copyMetadata(cast<Instruction>(*Select));
return NewSel;
};
if (Instruction *NewSel = SinkSubIntoSelect(
/*Select=*/Op0, /*OtherHandOfSub=*/Op1,
[Builder = &Builder, Op1](Value *OtherHandOfSelect) {
return Builder->CreateSub(OtherHandOfSelect,
/*OtherHandOfSub=*/Op1);
}))
return NewSel;
if (Instruction *NewSel = SinkSubIntoSelect(
/*Select=*/Op1, /*OtherHandOfSub=*/Op0,
[Builder = &Builder, Op0](Value *OtherHandOfSelect) {
return Builder->CreateSub(/*OtherHandOfSub=*/Op0,
OtherHandOfSelect);
}))
return NewSel;
}
if (Op1->hasOneUse()) {
Value *X = nullptr, *Y = nullptr, *Z = nullptr;
Constant *C = nullptr;
// (X - (Y - Z)) --> (X + (Z - Y)).
if (match(Op1, m_Sub(m_Value(Y), m_Value(Z))))
return BinaryOperator::CreateAdd(Op0,
Builder.CreateSub(Z, Y, Op1->getName()));
// (X - (X & Y)) --> (X & ~Y)
if (match(Op1, m_c_And(m_Value(Y), m_Specific(Op0))))
return BinaryOperator::CreateAnd(Op0,
Builder.CreateNot(Y, Y->getName() + ".not"));
// 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow.
if (match(Op0, m_Zero())) {
Constant *Op11C;
if (match(Op1, m_SDiv(m_Value(X), m_Constant(Op11C))) &&
!Op11C->containsUndefElement() && Op11C->isNotMinSignedValue() &&
Op11C->isNotOneValue()) {
Instruction *BO =
BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(Op11C));
BO->setIsExact(cast<BinaryOperator>(Op1)->isExact());
return BO;
}
}
// 0 - (X << Y) -> (-X << Y) when X is freely negatable.
if (match(Op1, m_Shl(m_Value(X), m_Value(Y))) && match(Op0, m_Zero()))
if (Value *XNeg = dyn_castNegVal(X))
return BinaryOperator::CreateShl(XNeg, Y);
// Subtracting -1/0 is the same as adding 1/0:
// sub [nsw] Op0, sext(bool Y) -> add [nsw] Op0, zext(bool Y)
// 'nuw' is dropped in favor of the canonical form.
if (match(Op1, m_SExt(m_Value(Y))) &&
Y->getType()->getScalarSizeInBits() == 1) {
Value *Zext = Builder.CreateZExt(Y, I.getType());
BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Zext);
Add->setHasNoSignedWrap(I.hasNoSignedWrap());
return Add;
}
// sub [nsw] X, zext(bool Y) -> add [nsw] X, sext(bool Y)
// 'nuw' is dropped in favor of the canonical form.
if (match(Op1, m_ZExt(m_Value(Y))) && Y->getType()->isIntOrIntVectorTy(1)) {
Value *Sext = Builder.CreateSExt(Y, I.getType());
BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Sext);
Add->setHasNoSignedWrap(I.hasNoSignedWrap());
return Add;
}
// X - A*-B -> X + A*B
// X - -A*B -> X + A*B
Value *A, *B;
if (match(Op1, m_c_Mul(m_Value(A), m_Neg(m_Value(B)))))
return BinaryOperator::CreateAdd(Op0, Builder.CreateMul(A, B));
// X - A*C -> X + A*-C
// No need to handle the commuted multiply because multiply handling will
// ensure the constant will be moved to the right-hand side.
if (match(Op1, m_Mul(m_Value(A), m_Constant(C))) && !isa<ConstantExpr>(C)) {
Value *NewMul = Builder.CreateMul(A, ConstantExpr::getNeg(C));
return BinaryOperator::CreateAdd(Op0, NewMul);
}
}
{
// ~A - Min/Max(~A, O) -> Max/Min(A, ~O) - A
// ~A - Min/Max(O, ~A) -> Max/Min(A, ~O) - A
// Min/Max(~A, O) - ~A -> A - Max/Min(A, ~O)
// Min/Max(O, ~A) - ~A -> A - Max/Min(A, ~O)
// So long as O here is freely invertible, this will be neutral or a win.
Value *LHS, *RHS, *A;
Value *NotA = Op0, *MinMax = Op1;
SelectPatternFlavor SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
if (!SelectPatternResult::isMinOrMax(SPF)) {
NotA = Op1;
MinMax = Op0;
SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
}
if (SelectPatternResult::isMinOrMax(SPF) &&
match(NotA, m_Not(m_Value(A))) && (NotA == LHS || NotA == RHS)) {
if (NotA == LHS)
std::swap(LHS, RHS);
// LHS is now O above and expected to have at least 2 uses (the min/max).
// NotA is expected to have 2 uses from the min/max and 1 from the sub.
if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
!NotA->hasNUsesOrMore(4)) {
// Note: We don't generate the inverse max/min, just create the not of
// it and let other folds do the rest.
Value *Not = Builder.CreateNot(MinMax);
if (NotA == Op0)
return BinaryOperator::CreateSub(Not, A);
else
return BinaryOperator::CreateSub(A, Not);
}
}
}
// Optimize differences between pointers into the same array into a size.
// Consider: &A[10] - &A[0]: we should compile this to "10".
Value *LHSOp, *RHSOp;
if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
match(Op1, m_PtrToInt(m_Value(RHSOp))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
I.hasNoUnsignedWrap()))
return replaceInstUsesWith(I, Res);
// trunc(p)-trunc(q) -> trunc(p-q)
if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
/* IsNUW */ false))
return replaceInstUsesWith(I, Res);
// Canonicalize a shifty way to code absolute value to the common pattern.
// There are 2 potential commuted variants.
// We're relying on the fact that we only do this transform when the shift has
// exactly 2 uses and the xor has exactly 1 use (otherwise, we might increase
// instructions).
Value *A;
const APInt *ShAmt;
Type *Ty = I.getType();
if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
match(Op0, m_OneUse(m_c_Xor(m_Specific(A), m_Specific(Op1))))) {
// B = ashr i32 A, 31 ; smear the sign bit
// sub (xor A, B), B ; flip bits if negative and subtract -1 (add 1)
// --> (A < 0) ? -A : A
Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
// Copy the nuw/nsw flags from the sub to the negate.
Value *Neg = Builder.CreateNeg(A, "", I.hasNoUnsignedWrap(),
I.hasNoSignedWrap());
return SelectInst::Create(Cmp, Neg, A);
}
if (Instruction *V =
canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
return V;
if (Instruction *Ext = narrowMathIfNoOverflow(I))
return Ext;
bool Changed = false;
if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
return Changed ? &I : nullptr;
}
/// This eliminates floating-point negation in either 'fneg(X)' or
/// 'fsub(-0.0, X)' form by combining into a constant operand.
static Instruction *foldFNegIntoConstant(Instruction &I) {
Value *X;
Constant *C;
// Fold negation into a constant operand. This is limited to one-use because
// fneg is assumed better for analysis and cheaper in codegen than fmul/fdiv.
// -(X * C) --> X * (-C)
// FIXME: It's arguable whether these should be m_OneUse or not. The current
// belief is that the FNeg allows for better reassociation opportunities.
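// e.g. (illustrative): fneg (fmul float %x, 4.0) --> fmul float %x, -4.0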
if (match(&I, m_FNeg(m_OneUse(m_FMul(m_Value(X), m_Constant(C))))))
return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
// -(X / C) --> X / (-C)
if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Value(X), m_Constant(C))))))
return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
// -(C / X) --> (-C) / X
if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Constant(C), m_Value(X))))))
return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
return nullptr;
}
static Instruction *hoistFNegAboveFMulFDiv(Instruction &I,
InstCombiner::BuilderTy &Builder) {
Value *FNeg;
if (!match(&I, m_FNeg(m_Value(FNeg))))
return nullptr;
Value *X, *Y;
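// Hoist the negation above the multiply/divide:
// -(X * Y) --> (-X) * Y and -(X / Y) --> (-X) / Y
// (presumably so the fneg can fold further into X).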
if (match(FNeg, m_OneUse(m_FMul(m_Value(X), m_Value(Y)))))
return BinaryOperator::CreateFMulFMF(Builder.CreateFNegFMF(X, &I), Y, &I);
if (match(FNeg, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))))
return BinaryOperator::CreateFDivFMF(Builder.CreateFNegFMF(X, &I), Y, &I);
return nullptr;
}
Instruction *InstCombiner::visitFNeg(UnaryOperator &I) {
Value *Op = I.getOperand(0);
if (Value *V = SimplifyFNegInst(Op, I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (Instruction *X = foldFNegIntoConstant(I))
return X;
Value *X, *Y;
// If we can ignore the sign of zeros: -(X - Y) --> (Y - X)
if (I.hasNoSignedZeros() &&
match(Op, m_OneUse(m_FSub(m_Value(X), m_Value(Y)))))
return BinaryOperator::CreateFSubFMF(Y, X, &I);
if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
return R;
return nullptr;
}
Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1),
I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (Instruction *X = foldVectorBinop(I))
return X;
// Subtraction from -0.0 is the canonical form of fneg.
// fsub nsz 0, X ==> fsub nsz -0.0, X
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (I.hasNoSignedZeros() && match(Op0, m_PosZeroFP()))
return BinaryOperator::CreateFNegFMF(Op1, &I);
if (Instruction *X = foldFNegIntoConstant(I))
return X;
if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
return R;
Value *X, *Y;
Constant *C;
// If Op0 is not -0.0 or we can ignore -0.0: Z - (X - Y) --> Z + (Y - X)
// Canonicalize to fadd to make analysis easier.
// This can also help codegen because fadd is commutative.
// Note that if this fsub was really an fneg, the fadd with -0.0 will get
// killed later. We still limit that particular transform with 'hasOneUse'
// because an fneg is assumed better/cheaper than a generic fsub.
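// e.g. (illustrative):
//   fsub nsz float %z, (fsub float %x, %y)
//     --> fadd nsz float %z, (fsub float %y, %x)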
if (I.hasNoSignedZeros() || CannotBeNegativeZero(Op0, SQ.TLI)) {
if (match(Op1, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
Value *NewSub = Builder.CreateFSubFMF(Y, X, &I);
return BinaryOperator::CreateFAddFMF(Op0, NewSub, &I);
}
}
if (isa<Constant>(Op0))
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
if (Instruction *NV = FoldOpIntoSelect(I, SI))
return NV;
// X - C --> X + (-C)
// But don't transform constant expressions because there's an inverse fold
// for X + (-Y) --> X - Y.
if (match(Op1, m_Constant(C)) && !isa<ConstantExpr>(Op1))
return BinaryOperator::CreateFAddFMF(Op0, ConstantExpr::getFNeg(C), &I);
// X - (-Y) --> X + Y
if (match(Op1, m_FNeg(m_Value(Y))))
return BinaryOperator::CreateFAddFMF(Op0, Y, &I);
// Similar to above, but look through a cast of the negated value:
// X - (fptrunc(-Y)) --> X + fptrunc(Y)
Type *Ty = I.getType();
if (match(Op1, m_OneUse(m_FPTrunc(m_FNeg(m_Value(Y))))))
return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPTrunc(Y, Ty), &I);
// X - (fpext(-Y)) --> X + fpext(Y)
if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y))))))
return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPExt(Y, Ty), &I);
// Similar to above, but look through fmul/fdiv of the negated value:
// Op0 - (-X * Y) --> Op0 + (X * Y)
// Op0 - (Y * -X) --> Op0 + (X * Y)
if (match(Op1, m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))))) {
Value *FMul = Builder.CreateFMulFMF(X, Y, &I);
return BinaryOperator::CreateFAddFMF(Op0, FMul, &I);
}
// Op0 - (-X / Y) --> Op0 + (X / Y)
// Op0 - (X / -Y) --> Op0 + (X / Y)
if (match(Op1, m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y)))) ||
match(Op1, m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))))) {
Value *FDiv = Builder.CreateFDivFMF(X, Y, &I);
return BinaryOperator::CreateFAddFMF(Op0, FDiv, &I);
}
// Handle special cases for FSub with selects feeding the operation.
if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
return replaceInstUsesWith(I, V);
if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
// (Y - X) - Y --> -X
if (match(Op0, m_FSub(m_Specific(Op1), m_Value(X))))
return BinaryOperator::CreateFNegFMF(X, &I);
// Y - (X + Y) --> -X
// Y - (Y + X) --> -X
if (match(Op1, m_c_FAdd(m_Specific(Op0), m_Value(X))))
return BinaryOperator::CreateFNegFMF(X, &I);
// (X * C) - X --> X * (C - 1.0)
if (match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) {
Constant *CSubOne = ConstantExpr::getFSub(C, ConstantFP::get(Ty, 1.0));
return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I);
}
// X - (X * C) --> X * (1.0 - C)
if (match(Op1, m_FMul(m_Specific(Op0), m_Constant(C)))) {
Constant *OneSubC = ConstantExpr::getFSub(ConstantFP::get(Ty, 1.0), C);
return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I);
}
if (Instruction *F = factorizeFAddFSub(I, Builder))
return F;
// TODO: This performs reassociative folds for FP ops. Some fraction of the
// functionality has been subsumed by simple pattern matching here and in
// InstSimplify. We should let a dedicated reassociation pass handle more
// complex pattern matching and remove this from InstCombine.
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
}
return nullptr;
}
Index: head/contrib/llvm-project/llvm
===================================================================
--- head/contrib/llvm-project/llvm (revision 362608)
+++ head/contrib/llvm-project/llvm (revision 362609)
Property changes on: head/contrib/llvm-project/llvm
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/llvm-project/release-10.x/llvm:r362444-362594
Index: head/contrib/llvm-project
===================================================================
--- head/contrib/llvm-project (revision 362608)
+++ head/contrib/llvm-project (revision 362609)
Property changes on: head/contrib/llvm-project
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/llvm-project/release-10.x:r362444-362594
Index: head/lib/clang/include/VCSVersion.inc
===================================================================
--- head/lib/clang/include/VCSVersion.inc (revision 362608)
+++ head/lib/clang/include/VCSVersion.inc (revision 362609)
@@ -1,14 +1,14 @@
// $FreeBSD$
-#define LLVM_REVISION "llvmorg-10.0.0-97-g6f71678ecd2"
+#define LLVM_REVISION "llvmorg-10.0.0-129-gd24d5c8e308"
#define LLVM_REPOSITORY "git@github.com:llvm/llvm-project.git"
-#define CLANG_REVISION "llvmorg-10.0.0-97-g6f71678ecd2"
+#define CLANG_REVISION "llvmorg-10.0.0-129-gd24d5c8e308"
#define CLANG_REPOSITORY "git@github.com:llvm/llvm-project.git"
// <Upstream revision at import>-<Local identifier in __FreeBSD_version style>
-#define LLD_REVISION "llvmorg-10.0.0-97-g6f71678ecd2-1300007"
+#define LLD_REVISION "llvmorg-10.0.0-129-gd24d5c8e308-1300007"
#define LLD_REPOSITORY "FreeBSD"
-#define LLDB_REVISION "llvmorg-10.0.0-97-g6f71678ecd2"
+#define LLDB_REVISION "llvmorg-10.0.0-129-gd24d5c8e308"
#define LLDB_REPOSITORY "git@github.com:llvm/llvm-project.git"
Index: head/lib/clang/include/llvm/Support/VCSRevision.h
===================================================================
--- head/lib/clang/include/llvm/Support/VCSRevision.h (revision 362608)
+++ head/lib/clang/include/llvm/Support/VCSRevision.h (revision 362609)
@@ -1,3 +1,3 @@
/* $FreeBSD$ */
-#define LLVM_REVISION "llvmorg-10.0.0-97-g6f71678ecd2"
+#define LLVM_REVISION "llvmorg-10.0.0-129-gd24d5c8e308"
#define LLVM_REPOSITORY "git@github.com:llvm/llvm-project.git"
Index: head/lib/clang/libllvm/Makefile
===================================================================
--- head/lib/clang/libllvm/Makefile (revision 362608)
+++ head/lib/clang/libllvm/Makefile (revision 362609)
@@ -1,1805 +1,1810 @@
# $FreeBSD$
.include <src.opts.mk>
.include "../llvm.pre.mk"
LIB= llvm
INTERNALLIB=
CFLAGS+= -I${.OBJDIR}
.if ${MK_LLVM_TARGET_AARCH64} == "no" && ${MK_LLVM_TARGET_ARM} == "no" && \
${MK_LLVM_TARGET_BPF} == "no" && ${MK_LLVM_TARGET_MIPS} == "no" && \
${MK_LLVM_TARGET_POWERPC} == "no" && ${MK_LLVM_TARGET_RISCV} == "no" && \
${MK_LLVM_TARGET_X86} == "no"
.error Please enable at least one of: MK_LLVM_TARGET_AARCH64,\
MK_LLVM_TARGET_ARM, MK_LLVM_TARGET_BPF, MK_LLVM_TARGET_MIPS, \
MK_LLVM_TARGET_POWERPC, MK_LLVM_TARGET_RISCV, or MK_LLVM_TARGET_X86
.endif
.for arch in AArch64 ARM BPF Mips PowerPC RISCV X86
. if ${MK_LLVM_TARGET_${arch:tu}} != "no"
CFLAGS+= -I${LLVM_SRCS}/lib/Target/${arch}
. endif
.endfor
SRCDIR= llvm/lib
# Explanation of different SRCS variants below:
# SRCS_MIN: always required, even for bootstrap
# SRCS_MIW: required for world stage (after cross-tools)
# SRCS_EXT: required for MK_CLANG_EXTRAS
# SRCS_EXL: required for MK_CLANG_EXTRAS and MK_LLD
# SRCS_FUL: required for MK_CLANG_FULL
# SRCS_LLD: required for MK_LLD
# SRCS_XDB: required for MK_CLANG_EXTRAS and MK_LLDB
# SRCS_XDL: required for MK_CLANG_EXTRAS, MK_LLD and MK_LLDB
# SRCS_XDW: required for MK_CLANG_EXTRAS and MK_LLDB in world stage
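# Illustrative sketch (assumed; the merging logic sits outside this
# excerpt): the per-option lists above are presumably folded into SRCS
# according to the MK_* knobs, roughly:
#   SRCS+=	${SRCS_MIN}
#   .if ${MK_CLANG_EXTRAS} != "no"
#   SRCS+=	${SRCS_EXT}
#   .endif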
SRCS_MIN+= Analysis/AliasAnalysis.cpp
SRCS_MIN+= Analysis/AliasAnalysisEvaluator.cpp
SRCS_MIN+= Analysis/AliasAnalysisSummary.cpp
SRCS_MIN+= Analysis/AliasSetTracker.cpp
SRCS_EXT+= Analysis/Analysis.cpp
SRCS_MIN+= Analysis/AssumptionCache.cpp
SRCS_MIN+= Analysis/BasicAliasAnalysis.cpp
SRCS_MIN+= Analysis/BlockFrequencyInfo.cpp
SRCS_MIN+= Analysis/BlockFrequencyInfoImpl.cpp
SRCS_MIN+= Analysis/BranchProbabilityInfo.cpp
SRCS_MIN+= Analysis/CFG.cpp
SRCS_MIN+= Analysis/CFGPrinter.cpp
SRCS_MIN+= Analysis/CFLAndersAliasAnalysis.cpp
SRCS_MIN+= Analysis/CFLSteensAliasAnalysis.cpp
SRCS_MIN+= Analysis/CGSCCPassManager.cpp
SRCS_MIN+= Analysis/CallGraph.cpp
SRCS_MIN+= Analysis/CallGraphSCCPass.cpp
SRCS_MIN+= Analysis/CallPrinter.cpp
SRCS_MIN+= Analysis/CaptureTracking.cpp
SRCS_MIN+= Analysis/CmpInstAnalysis.cpp
SRCS_MIN+= Analysis/CodeMetrics.cpp
SRCS_MIN+= Analysis/ConstantFolding.cpp
SRCS_MIN+= Analysis/CostModel.cpp
SRCS_MIN+= Analysis/DDG.cpp
SRCS_MIN+= Analysis/Delinearization.cpp
SRCS_MIN+= Analysis/DemandedBits.cpp
SRCS_MIN+= Analysis/DependenceAnalysis.cpp
SRCS_MIN+= Analysis/DependenceGraphBuilder.cpp
SRCS_MIN+= Analysis/DivergenceAnalysis.cpp
SRCS_MIN+= Analysis/DomPrinter.cpp
SRCS_MIN+= Analysis/DomTreeUpdater.cpp
SRCS_MIN+= Analysis/DominanceFrontier.cpp
SRCS_MIN+= Analysis/EHPersonalities.cpp
SRCS_MIN+= Analysis/GlobalsModRef.cpp
SRCS_MIN+= Analysis/GuardUtils.cpp
SRCS_MIN+= Analysis/IVDescriptors.cpp
SRCS_MIN+= Analysis/IVUsers.cpp
SRCS_MIN+= Analysis/IndirectCallPromotionAnalysis.cpp
SRCS_MIN+= Analysis/InlineCost.cpp
SRCS_MIN+= Analysis/InstCount.cpp
SRCS_MIN+= Analysis/InstructionPrecedenceTracking.cpp
SRCS_MIN+= Analysis/InstructionSimplify.cpp
SRCS_MIN+= Analysis/Interval.cpp
SRCS_MIN+= Analysis/IntervalPartition.cpp
SRCS_MIN+= Analysis/LazyBlockFrequencyInfo.cpp
SRCS_MIN+= Analysis/LazyBranchProbabilityInfo.cpp
SRCS_MIN+= Analysis/LazyCallGraph.cpp
SRCS_MIN+= Analysis/LazyValueInfo.cpp
SRCS_MIN+= Analysis/LegacyDivergenceAnalysis.cpp
SRCS_MIN+= Analysis/Lint.cpp
SRCS_MIN+= Analysis/Loads.cpp
SRCS_MIN+= Analysis/LoopAccessAnalysis.cpp
SRCS_MIN+= Analysis/LoopAnalysisManager.cpp
SRCS_MIN+= Analysis/LoopCacheAnalysis.cpp
SRCS_MIN+= Analysis/LoopInfo.cpp
SRCS_MIN+= Analysis/LoopPass.cpp
SRCS_MIN+= Analysis/LoopUnrollAnalyzer.cpp
SRCS_MIN+= Analysis/MemDepPrinter.cpp
SRCS_MIN+= Analysis/MemDerefPrinter.cpp
SRCS_MIN+= Analysis/MemoryBuiltins.cpp
SRCS_MIN+= Analysis/MemoryDependenceAnalysis.cpp
SRCS_MIN+= Analysis/MemoryLocation.cpp
SRCS_MIN+= Analysis/MemorySSA.cpp
SRCS_MIN+= Analysis/MemorySSAUpdater.cpp
SRCS_MIN+= Analysis/ModuleDebugInfoPrinter.cpp
SRCS_MIN+= Analysis/ModuleSummaryAnalysis.cpp
SRCS_MIN+= Analysis/MustExecute.cpp
SRCS_MIN+= Analysis/ObjCARCAliasAnalysis.cpp
SRCS_MIN+= Analysis/ObjCARCAnalysisUtils.cpp
SRCS_MIN+= Analysis/ObjCARCInstKind.cpp
SRCS_MIN+= Analysis/OptimizationRemarkEmitter.cpp
SRCS_MIN+= Analysis/OrderedBasicBlock.cpp
SRCS_MIN+= Analysis/OrderedInstructions.cpp
SRCS_MIN+= Analysis/PHITransAddr.cpp
SRCS_MIN+= Analysis/PhiValues.cpp
SRCS_MIN+= Analysis/PostDominators.cpp
SRCS_MIN+= Analysis/ProfileSummaryInfo.cpp
SRCS_MIN+= Analysis/PtrUseVisitor.cpp
SRCS_MIN+= Analysis/RegionInfo.cpp
SRCS_MIN+= Analysis/RegionPass.cpp
SRCS_MIN+= Analysis/RegionPrinter.cpp
SRCS_MIN+= Analysis/ScalarEvolution.cpp
SRCS_MIN+= Analysis/ScalarEvolutionAliasAnalysis.cpp
SRCS_MIN+= Analysis/ScalarEvolutionExpander.cpp
SRCS_MIN+= Analysis/ScalarEvolutionNormalization.cpp
SRCS_MIN+= Analysis/ScopedNoAliasAA.cpp
SRCS_MIN+= Analysis/StackSafetyAnalysis.cpp
SRCS_MIN+= Analysis/SyncDependenceAnalysis.cpp
SRCS_MIN+= Analysis/SyntheticCountsUtils.cpp
SRCS_MIN+= Analysis/TargetLibraryInfo.cpp
SRCS_MIN+= Analysis/TargetTransformInfo.cpp
SRCS_MIN+= Analysis/TypeBasedAliasAnalysis.cpp
SRCS_MIN+= Analysis/TypeMetadataUtils.cpp
SRCS_MIN+= Analysis/VFABIDemangling.cpp
SRCS_MIN+= Analysis/ValueLattice.cpp
SRCS_MIN+= Analysis/ValueLatticeUtils.cpp
SRCS_MIN+= Analysis/ValueTracking.cpp
SRCS_MIN+= Analysis/VectorUtils.cpp
SRCS_MIN+= AsmParser/LLLexer.cpp
SRCS_MIN+= AsmParser/LLParser.cpp
SRCS_MIN+= AsmParser/Parser.cpp
SRCS_MIN+= BinaryFormat/Dwarf.cpp
SRCS_MIN+= BinaryFormat/Magic.cpp
SRCS_MIN+= BinaryFormat/Wasm.cpp
SRCS_MIN+= BinaryFormat/XCOFF.cpp
SRCS_MIN+= Bitcode/Reader/BitReader.cpp
SRCS_EXT+= Bitcode/Reader/BitcodeAnalyzer.cpp
SRCS_MIN+= Bitcode/Reader/BitcodeReader.cpp
SRCS_MIN+= Bitcode/Reader/MetadataLoader.cpp
SRCS_MIN+= Bitcode/Reader/ValueList.cpp
SRCS_MIN+= Bitcode/Writer/BitcodeWriter.cpp
SRCS_MIN+= Bitcode/Writer/BitcodeWriterPass.cpp
SRCS_MIN+= Bitcode/Writer/ValueEnumerator.cpp
SRCS_MIN+= Bitstream/Reader/BitstreamReader.cpp
SRCS_MIN+= CodeGen/AggressiveAntiDepBreaker.cpp
SRCS_MIN+= CodeGen/AllocationOrder.cpp
SRCS_MIN+= CodeGen/Analysis.cpp
SRCS_MIN+= CodeGen/AsmPrinter/ARMException.cpp
SRCS_MIN+= CodeGen/AsmPrinter/AccelTable.cpp
SRCS_MIN+= CodeGen/AsmPrinter/AddressPool.cpp
SRCS_MIN+= CodeGen/AsmPrinter/AsmPrinter.cpp
SRCS_MIN+= CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
SRCS_MIN+= CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
SRCS_MIN+= CodeGen/AsmPrinter/CodeViewDebug.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DIE.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DIEHash.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DebugHandlerBase.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DebugLocStream.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DwarfCFIException.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DwarfCompileUnit.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DwarfDebug.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DwarfExpression.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DwarfFile.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DwarfStringPool.cpp
SRCS_MIN+= CodeGen/AsmPrinter/DwarfUnit.cpp
SRCS_MIN+= CodeGen/AsmPrinter/EHStreamer.cpp
SRCS_EXT+= CodeGen/AsmPrinter/ErlangGCPrinter.cpp
SRCS_MIN+= CodeGen/AsmPrinter/OcamlGCPrinter.cpp
SRCS_MIN+= CodeGen/AsmPrinter/WasmException.cpp
SRCS_MIN+= CodeGen/AsmPrinter/WinCFGuard.cpp
SRCS_MIN+= CodeGen/AsmPrinter/WinException.cpp
SRCS_MIN+= CodeGen/AtomicExpandPass.cpp
SRCS_MIN+= CodeGen/BasicTargetTransformInfo.cpp
SRCS_MIN+= CodeGen/BranchFolding.cpp
SRCS_MIN+= CodeGen/BranchRelaxation.cpp
SRCS_MIN+= CodeGen/BreakFalseDeps.cpp
SRCS_EXT+= CodeGen/BuiltinGCs.cpp
SRCS_MIN+= CodeGen/CFGuardLongjmp.cpp
SRCS_MIN+= CodeGen/CFIInstrInserter.cpp
SRCS_MIN+= CodeGen/CalcSpillWeights.cpp
SRCS_MIN+= CodeGen/CallingConvLower.cpp
SRCS_MIN+= CodeGen/CodeGen.cpp
SRCS_MIN+= CodeGen/CodeGenPrepare.cpp
SRCS_MIN+= CodeGen/CriticalAntiDepBreaker.cpp
SRCS_MIN+= CodeGen/DFAPacketizer.cpp
SRCS_MIN+= CodeGen/DeadMachineInstructionElim.cpp
SRCS_MIN+= CodeGen/DetectDeadLanes.cpp
SRCS_MIN+= CodeGen/DwarfEHPrepare.cpp
SRCS_MIN+= CodeGen/EarlyIfConversion.cpp
SRCS_MIN+= CodeGen/EdgeBundles.cpp
SRCS_MIN+= CodeGen/ExecutionDomainFix.cpp
SRCS_MIN+= CodeGen/ExpandMemCmp.cpp
SRCS_MIN+= CodeGen/ExpandPostRAPseudos.cpp
SRCS_MIN+= CodeGen/ExpandReductions.cpp
SRCS_MIN+= CodeGen/FEntryInserter.cpp
SRCS_MIN+= CodeGen/FaultMaps.cpp
SRCS_MIN+= CodeGen/FinalizeISel.cpp
SRCS_MIN+= CodeGen/FuncletLayout.cpp
SRCS_MIN+= CodeGen/GCMetadata.cpp
SRCS_MIN+= CodeGen/GCMetadataPrinter.cpp
SRCS_MIN+= CodeGen/GCRootLowering.cpp
SRCS_MIN+= CodeGen/GCStrategy.cpp
SRCS_MIN+= CodeGen/GlobalISel/CSEInfo.cpp
SRCS_MIN+= CodeGen/GlobalISel/CSEMIRBuilder.cpp
SRCS_MIN+= CodeGen/GlobalISel/Combiner.cpp
SRCS_MIN+= CodeGen/GlobalISel/CombinerHelper.cpp
SRCS_MIN+= CodeGen/GlobalISel/CallLowering.cpp
SRCS_MIN+= CodeGen/GlobalISel/GISelChangeObserver.cpp
SRCS_MIN+= CodeGen/GlobalISel/GISelKnownBits.cpp
SRCS_MIN+= CodeGen/GlobalISel/GlobalISel.cpp
SRCS_MIN+= CodeGen/GlobalISel/IRTranslator.cpp
SRCS_MIN+= CodeGen/GlobalISel/InstructionSelect.cpp
SRCS_MIN+= CodeGen/GlobalISel/InstructionSelector.cpp
SRCS_MIN+= CodeGen/GlobalISel/LegalityPredicates.cpp
SRCS_MIN+= CodeGen/GlobalISel/LegalizeMutations.cpp
SRCS_MIN+= CodeGen/GlobalISel/Legalizer.cpp
SRCS_MIN+= CodeGen/GlobalISel/LegalizerHelper.cpp
SRCS_MIN+= CodeGen/GlobalISel/LegalizerInfo.cpp
SRCS_MIN+= CodeGen/GlobalISel/Localizer.cpp
SRCS_MIN+= CodeGen/GlobalISel/MachineIRBuilder.cpp
SRCS_MIN+= CodeGen/GlobalISel/RegBankSelect.cpp
SRCS_MIN+= CodeGen/GlobalISel/RegisterBank.cpp
SRCS_MIN+= CodeGen/GlobalISel/RegisterBankInfo.cpp
SRCS_MIN+= CodeGen/GlobalISel/Utils.cpp
SRCS_MIN+= CodeGen/GlobalMerge.cpp
SRCS_MIN+= CodeGen/HardwareLoops.cpp
SRCS_MIN+= CodeGen/IfConversion.cpp
SRCS_MIN+= CodeGen/ImplicitNullChecks.cpp
SRCS_MIN+= CodeGen/IndirectBrExpandPass.cpp
SRCS_MIN+= CodeGen/InlineSpiller.cpp
SRCS_MIN+= CodeGen/InterferenceCache.cpp
SRCS_MIN+= CodeGen/InterleavedAccessPass.cpp
SRCS_MIN+= CodeGen/InterleavedLoadCombinePass.cpp
SRCS_MIN+= CodeGen/IntrinsicLowering.cpp
SRCS_MIN+= CodeGen/LLVMTargetMachine.cpp
SRCS_MIN+= CodeGen/LatencyPriorityQueue.cpp
SRCS_MIN+= CodeGen/LazyMachineBlockFrequencyInfo.cpp
SRCS_MIN+= CodeGen/LexicalScopes.cpp
SRCS_MIN+= CodeGen/LiveDebugValues.cpp
SRCS_MIN+= CodeGen/LiveDebugVariables.cpp
SRCS_MIN+= CodeGen/LiveInterval.cpp
SRCS_MIN+= CodeGen/LiveIntervalUnion.cpp
SRCS_MIN+= CodeGen/LiveIntervals.cpp
SRCS_MIN+= CodeGen/LivePhysRegs.cpp
SRCS_MIN+= CodeGen/LiveRangeCalc.cpp
SRCS_MIN+= CodeGen/LiveRangeEdit.cpp
SRCS_MIN+= CodeGen/LiveRangeShrink.cpp
SRCS_MIN+= CodeGen/LiveRegMatrix.cpp
SRCS_MIN+= CodeGen/LiveRegUnits.cpp
SRCS_MIN+= CodeGen/LiveStacks.cpp
SRCS_MIN+= CodeGen/LiveVariables.cpp
SRCS_MIN+= CodeGen/LocalStackSlotAllocation.cpp
SRCS_MIN+= CodeGen/LoopTraversal.cpp
SRCS_MIN+= CodeGen/LowLevelType.cpp
SRCS_MIN+= CodeGen/LowerEmuTLS.cpp
SRCS_MIN+= CodeGen/MIRCanonicalizerPass.cpp
SRCS_MIN+= CodeGen/MIRNamerPass.cpp
SRCS_EXT+= CodeGen/MIRParser/MILexer.cpp
SRCS_EXT+= CodeGen/MIRParser/MIParser.cpp
SRCS_EXT+= CodeGen/MIRParser/MIRParser.cpp
SRCS_MIN+= CodeGen/MIRPrinter.cpp
SRCS_MIN+= CodeGen/MIRPrintingPass.cpp
SRCS_MIN+= CodeGen/MIRVRegNamerUtils.cpp
SRCS_MIN+= CodeGen/MachineBasicBlock.cpp
SRCS_MIN+= CodeGen/MachineBlockFrequencyInfo.cpp
SRCS_MIN+= CodeGen/MachineBlockPlacement.cpp
SRCS_MIN+= CodeGen/MachineBranchProbabilityInfo.cpp
SRCS_MIN+= CodeGen/MachineCSE.cpp
SRCS_MIN+= CodeGen/MachineCombiner.cpp
SRCS_MIN+= CodeGen/MachineCopyPropagation.cpp
SRCS_MIN+= CodeGen/MachineDominanceFrontier.cpp
SRCS_MIN+= CodeGen/MachineDominators.cpp
SRCS_MIN+= CodeGen/MachineFrameInfo.cpp
SRCS_MIN+= CodeGen/MachineFunction.cpp
SRCS_MIN+= CodeGen/MachineFunctionPass.cpp
SRCS_MIN+= CodeGen/MachineFunctionPrinterPass.cpp
SRCS_MIN+= CodeGen/MachineInstr.cpp
SRCS_MIN+= CodeGen/MachineInstrBundle.cpp
SRCS_MIN+= CodeGen/MachineLICM.cpp
SRCS_MIN+= CodeGen/MachineLoopInfo.cpp
SRCS_MIN+= CodeGen/MachineLoopUtils.cpp
SRCS_MIN+= CodeGen/MachineModuleInfo.cpp
SRCS_MIN+= CodeGen/MachineModuleInfoImpls.cpp
SRCS_MIN+= CodeGen/MachineOperand.cpp
SRCS_MIN+= CodeGen/MachineOptimizationRemarkEmitter.cpp
SRCS_MIN+= CodeGen/MachineOutliner.cpp
SRCS_MIN+= CodeGen/MachinePipeliner.cpp
SRCS_MIN+= CodeGen/MachinePostDominators.cpp
SRCS_MIN+= CodeGen/MachineRegionInfo.cpp
SRCS_MIN+= CodeGen/MachineRegisterInfo.cpp
SRCS_MIN+= CodeGen/MachineSSAUpdater.cpp
SRCS_MIN+= CodeGen/MachineScheduler.cpp
SRCS_MIN+= CodeGen/MachineSink.cpp
SRCS_MIN+= CodeGen/MachineSizeOpts.cpp
SRCS_MIN+= CodeGen/MachineTraceMetrics.cpp
SRCS_MIN+= CodeGen/MachineVerifier.cpp
SRCS_MIN+= CodeGen/MacroFusion.cpp
SRCS_MIN+= CodeGen/ModuloSchedule.cpp
SRCS_MIN+= CodeGen/OptimizePHIs.cpp
SRCS_MIN+= CodeGen/PHIElimination.cpp
SRCS_MIN+= CodeGen/PHIEliminationUtils.cpp
SRCS_MIN+= CodeGen/ParallelCG.cpp
SRCS_MIN+= CodeGen/PatchableFunction.cpp
SRCS_MIN+= CodeGen/PeepholeOptimizer.cpp
SRCS_MIN+= CodeGen/PostRAHazardRecognizer.cpp
SRCS_MIN+= CodeGen/PostRASchedulerList.cpp
SRCS_MIN+= CodeGen/PreISelIntrinsicLowering.cpp
SRCS_MIN+= CodeGen/ProcessImplicitDefs.cpp
SRCS_MIN+= CodeGen/PrologEpilogInserter.cpp
SRCS_MIN+= CodeGen/PseudoSourceValue.cpp
SRCS_MIN+= CodeGen/ReachingDefAnalysis.cpp
+SRCS_MIN+= CodeGen/RDFGraph.cpp
+SRCS_MIN+= CodeGen/RDFLiveness.cpp
+SRCS_MIN+= CodeGen/RDFRegisters.cpp
SRCS_MIN+= CodeGen/RegAllocBase.cpp
SRCS_MIN+= CodeGen/RegAllocBasic.cpp
SRCS_MIN+= CodeGen/RegAllocFast.cpp
SRCS_MIN+= CodeGen/RegAllocGreedy.cpp
SRCS_MIN+= CodeGen/RegAllocPBQP.cpp
SRCS_MIN+= CodeGen/RegUsageInfoCollector.cpp
SRCS_MIN+= CodeGen/RegUsageInfoPropagate.cpp
SRCS_MIN+= CodeGen/RegisterClassInfo.cpp
SRCS_MIN+= CodeGen/RegisterCoalescer.cpp
SRCS_MIN+= CodeGen/RegisterPressure.cpp
SRCS_MIN+= CodeGen/RegisterScavenging.cpp
SRCS_MIN+= CodeGen/RegisterUsageInfo.cpp
SRCS_MIN+= CodeGen/RenameIndependentSubregs.cpp
SRCS_MIN+= CodeGen/ResetMachineFunctionPass.cpp
SRCS_MIN+= CodeGen/SafeStack.cpp
SRCS_MIN+= CodeGen/SafeStackColoring.cpp
SRCS_MIN+= CodeGen/SafeStackLayout.cpp
SRCS_MIN+= CodeGen/ScalarizeMaskedMemIntrin.cpp
SRCS_MIN+= CodeGen/ScheduleDAG.cpp
SRCS_MIN+= CodeGen/ScheduleDAGInstrs.cpp
SRCS_MIN+= CodeGen/ScheduleDAGPrinter.cpp
SRCS_MIN+= CodeGen/ScoreboardHazardRecognizer.cpp
SRCS_MIN+= CodeGen/SelectionDAG/DAGCombiner.cpp
SRCS_MIN+= CodeGen/SelectionDAG/FastISel.cpp
SRCS_MIN+= CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
SRCS_MIN+= CodeGen/SelectionDAG/InstrEmitter.cpp
SRCS_MIN+= CodeGen/SelectionDAG/LegalizeDAG.cpp
SRCS_MIN+= CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
SRCS_MIN+= CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
SRCS_MIN+= CodeGen/SelectionDAG/LegalizeTypes.cpp
SRCS_MIN+= CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
SRCS_MIN+= CodeGen/SelectionDAG/LegalizeVectorOps.cpp
SRCS_MIN+= CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
SRCS_MIN+= CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
SRCS_MIN+= CodeGen/SelectionDAG/ScheduleDAGFast.cpp
SRCS_MIN+= CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
SRCS_MIN+= CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
SRCS_MIN+= CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAG.cpp
SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGDumper.cpp
SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGISel.cpp
SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp
SRCS_MIN+= CodeGen/SelectionDAG/StatepointLowering.cpp
SRCS_MIN+= CodeGen/SelectionDAG/TargetLowering.cpp
SRCS_MIN+= CodeGen/ShadowStackGCLowering.cpp
SRCS_MIN+= CodeGen/ShrinkWrap.cpp
SRCS_MIN+= CodeGen/SjLjEHPrepare.cpp
SRCS_MIN+= CodeGen/SlotIndexes.cpp
SRCS_MIN+= CodeGen/SpillPlacement.cpp
SRCS_MIN+= CodeGen/SplitKit.cpp
SRCS_MIN+= CodeGen/StackColoring.cpp
SRCS_MIN+= CodeGen/StackMapLivenessAnalysis.cpp
SRCS_MIN+= CodeGen/StackMaps.cpp
SRCS_MIN+= CodeGen/StackProtector.cpp
SRCS_MIN+= CodeGen/StackSlotColoring.cpp
SRCS_MIN+= CodeGen/SwiftErrorValueTracking.cpp
SRCS_MIN+= CodeGen/SwitchLoweringUtils.cpp
SRCS_MIN+= CodeGen/TailDuplication.cpp
SRCS_MIN+= CodeGen/TailDuplicator.cpp
SRCS_MIN+= CodeGen/TargetFrameLoweringImpl.cpp
SRCS_MIN+= CodeGen/TargetInstrInfo.cpp
SRCS_MIN+= CodeGen/TargetLoweringBase.cpp
SRCS_MIN+= CodeGen/TargetLoweringObjectFileImpl.cpp
SRCS_MIN+= CodeGen/TargetOptionsImpl.cpp
SRCS_MIN+= CodeGen/TargetPassConfig.cpp
SRCS_MIN+= CodeGen/TargetRegisterInfo.cpp
SRCS_MIN+= CodeGen/TargetSchedule.cpp
SRCS_MIN+= CodeGen/TargetSubtargetInfo.cpp
SRCS_MIN+= CodeGen/TwoAddressInstructionPass.cpp
SRCS_MIN+= CodeGen/TypePromotion.cpp
SRCS_MIN+= CodeGen/UnreachableBlockElim.cpp
SRCS_MIN+= CodeGen/ValueTypes.cpp
SRCS_MIN+= CodeGen/VirtRegMap.cpp
SRCS_MIN+= CodeGen/WasmEHPrepare.cpp
SRCS_MIN+= CodeGen/WinEHPrepare.cpp
SRCS_MIN+= CodeGen/XRayInstrumentation.cpp
SRCS_EXT+= DebugInfo/CodeView/AppendingTypeTableBuilder.cpp
SRCS_MIN+= DebugInfo/CodeView/CVSymbolVisitor.cpp
SRCS_MIN+= DebugInfo/CodeView/CVTypeVisitor.cpp
SRCS_MIN+= DebugInfo/CodeView/CodeViewError.cpp
SRCS_MIN+= DebugInfo/CodeView/CodeViewRecordIO.cpp
SRCS_MIN+= DebugInfo/CodeView/ContinuationRecordBuilder.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugChecksumsSubsection.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugCrossExSubsection.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugCrossImpSubsection.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugFrameDataSubsection.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugLinesSubsection.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugStringTableSubsection.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugSubsection.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugSubsectionRecord.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugSubsectionVisitor.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugSymbolRVASubsection.cpp
SRCS_EXT+= DebugInfo/CodeView/DebugSymbolsSubsection.cpp
SRCS_MIN+= DebugInfo/CodeView/EnumTables.cpp
SRCS_MIN+= DebugInfo/CodeView/Formatters.cpp
SRCS_MIN+= DebugInfo/CodeView/GlobalTypeTableBuilder.cpp
SRCS_EXT+= DebugInfo/CodeView/LazyRandomTypeCollection.cpp
SRCS_MIN+= DebugInfo/CodeView/Line.cpp
SRCS_EXT+= DebugInfo/CodeView/MergingTypeTableBuilder.cpp
SRCS_MIN+= DebugInfo/CodeView/RecordName.cpp
SRCS_MIN+= DebugInfo/CodeView/RecordSerialization.cpp
SRCS_MIN+= DebugInfo/CodeView/SimpleTypeSerializer.cpp
SRCS_EXT+= DebugInfo/CodeView/StringsAndChecksums.cpp
SRCS_MIN+= DebugInfo/CodeView/SymbolDumper.cpp
SRCS_MIN+= DebugInfo/CodeView/SymbolRecordMapping.cpp
SRCS_EXT+= DebugInfo/CodeView/SymbolSerializer.cpp
SRCS_MIN+= DebugInfo/CodeView/TypeDumpVisitor.cpp
SRCS_MIN+= DebugInfo/CodeView/TypeHashing.cpp
SRCS_MIN+= DebugInfo/CodeView/TypeIndex.cpp
SRCS_MIN+= DebugInfo/CodeView/TypeIndexDiscovery.cpp
SRCS_EXT+= DebugInfo/CodeView/TypeRecordHelpers.cpp
SRCS_MIN+= DebugInfo/CodeView/TypeRecordMapping.cpp
SRCS_MIN+= DebugInfo/CodeView/TypeStreamMerger.cpp
SRCS_MIN+= DebugInfo/CodeView/TypeTableCollection.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFAcceleratorTable.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFAddressRange.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFCompileUnit.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFContext.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDataExtractor.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugAbbrev.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugAddr.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugArangeSet.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugAranges.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugFrame.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugLine.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugLoc.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugMacro.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugPubTable.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugRangeList.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDebugRnglists.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFDie.cpp
SRCS_MIN+= DebugInfo/DWARF/DWARFExpression.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFFormValue.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFGdbIndex.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFListTable.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFTypeUnit.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFUnit.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFUnitIndex.cpp
SRCS_MIW+= DebugInfo/DWARF/DWARFVerifier.cpp
SRCS_MIN+= DebugInfo/MSF/MSFBuilder.cpp
SRCS_MIN+= DebugInfo/MSF/MSFCommon.cpp
SRCS_EXT+= DebugInfo/MSF/MSFError.cpp
SRCS_MIN+= DebugInfo/MSF/MappedBlockStream.cpp
SRCS_EXT+= DebugInfo/PDB/GenericError.cpp
SRCS_EXT+= DebugInfo/PDB/IPDBSourceFile.cpp
SRCS_EXT+= DebugInfo/PDB/Native/DbiModuleDescriptor.cpp
SRCS_EXT+= DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
SRCS_EXT+= DebugInfo/PDB/Native/DbiModuleList.cpp
SRCS_EXT+= DebugInfo/PDB/Native/DbiStream.cpp
SRCS_EXT+= DebugInfo/PDB/Native/DbiStreamBuilder.cpp
SRCS_EXT+= DebugInfo/PDB/Native/EnumTables.cpp
SRCS_EXT+= DebugInfo/PDB/Native/GSIStreamBuilder.cpp
SRCS_EXT+= DebugInfo/PDB/Native/GlobalsStream.cpp
SRCS_EXT+= DebugInfo/PDB/Native/Hash.cpp
SRCS_EXT+= DebugInfo/PDB/Native/HashTable.cpp
SRCS_EXT+= DebugInfo/PDB/Native/InfoStream.cpp
SRCS_EXT+= DebugInfo/PDB/Native/InfoStreamBuilder.cpp
SRCS_EXT+= DebugInfo/PDB/Native/InjectedSourceStream.cpp
SRCS_EXT+= DebugInfo/PDB/Native/ModuleDebugStream.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NamedStreamMap.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeEnumGlobals.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeEnumModules.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeEnumTypes.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeExeSymbol.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeRawSymbol.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeSession.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeTypeArray.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeTypeBuiltin.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeTypeEnum.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeTypePointer.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeTypeTypedef.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeTypeUDT.cpp
SRCS_EXT+= DebugInfo/PDB/Native/NativeTypeVTShape.cpp
SRCS_EXT+= DebugInfo/PDB/Native/PDBFile.cpp
SRCS_EXT+= DebugInfo/PDB/Native/PDBFileBuilder.cpp
SRCS_EXT+= DebugInfo/PDB/Native/PDBStringTable.cpp
SRCS_EXT+= DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
SRCS_EXT+= DebugInfo/PDB/Native/PublicsStream.cpp
SRCS_EXT+= DebugInfo/PDB/Native/RawError.cpp
SRCS_EXT+= DebugInfo/PDB/Native/SymbolCache.cpp
SRCS_EXT+= DebugInfo/PDB/Native/SymbolStream.cpp
SRCS_EXT+= DebugInfo/PDB/Native/TpiHashing.cpp
SRCS_EXT+= DebugInfo/PDB/Native/TpiStream.cpp
SRCS_EXT+= DebugInfo/PDB/Native/TpiStreamBuilder.cpp
SRCS_EXT+= DebugInfo/PDB/PDB.cpp
SRCS_EXT+= DebugInfo/PDB/PDBContext.cpp
SRCS_EXT+= DebugInfo/PDB/PDBExtras.cpp
SRCS_EXT+= DebugInfo/PDB/PDBInterfaceAnchors.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymDumper.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbol.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolAnnotation.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolBlock.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolCompiland.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolCustom.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolData.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolExe.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolFunc.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolLabel.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolThunk.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeArray.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeCustom.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeDimension.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeEnum.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeFriend.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeManaged.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypePointer.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeUDT.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeVTable.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolUnknown.cpp
SRCS_EXT+= DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
SRCS_EXT+= DebugInfo/PDB/UDTLayout.cpp
SRCS_MIW+= DebugInfo/Symbolize/DIPrinter.cpp
SRCS_MIW+= DebugInfo/Symbolize/SymbolizableObjectFile.cpp
SRCS_MIW+= DebugInfo/Symbolize/Symbolize.cpp
SRCS_MIW+= Demangle/Demangle.cpp
SRCS_MIN+= Demangle/ItaniumDemangle.cpp
SRCS_MIW+= Demangle/MicrosoftDemangle.cpp
SRCS_MIW+= Demangle/MicrosoftDemangleNodes.cpp
SRCS_XDB+= ExecutionEngine/ExecutionEngine.cpp
SRCS_XDB+= ExecutionEngine/ExecutionEngineBindings.cpp
SRCS_XDB+= ExecutionEngine/GDBRegistrationListener.cpp
SRCS_XDB+= ExecutionEngine/Interpreter/Execution.cpp
SRCS_XDB+= ExecutionEngine/Interpreter/ExternalFunctions.cpp
SRCS_XDB+= ExecutionEngine/Interpreter/Interpreter.cpp
SRCS_EXT+= ExecutionEngine/JITLink/EHFrameSupport.cpp
SRCS_EXT+= ExecutionEngine/JITLink/JITLink.cpp
SRCS_EXT+= ExecutionEngine/JITLink/JITLinkGeneric.cpp
SRCS_EXT+= ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
SRCS_EXT+= ExecutionEngine/JITLink/MachO.cpp
SRCS_EXT+= ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
SRCS_EXT+= ExecutionEngine/JITLink/MachO_arm64.cpp
SRCS_EXT+= ExecutionEngine/JITLink/MachO_x86_64.cpp
SRCS_XDB+= ExecutionEngine/MCJIT/MCJIT.cpp
SRCS_EXT+= ExecutionEngine/Orc/CompileOnDemandLayer.cpp
SRCS_EXT+= ExecutionEngine/Orc/CompileUtils.cpp
SRCS_EXT+= ExecutionEngine/Orc/Core.cpp
SRCS_EXT+= ExecutionEngine/Orc/ExecutionUtils.cpp
SRCS_EXT+= ExecutionEngine/Orc/IRCompileLayer.cpp
SRCS_EXT+= ExecutionEngine/Orc/IRTransformLayer.cpp
SRCS_EXT+= ExecutionEngine/Orc/IndirectionUtils.cpp
SRCS_EXT+= ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
SRCS_EXT+= ExecutionEngine/Orc/LLJIT.cpp
SRCS_EXT+= ExecutionEngine/Orc/Layer.cpp
SRCS_EXT+= ExecutionEngine/Orc/LazyReexports.cpp
SRCS_EXT+= ExecutionEngine/Orc/Legacy.cpp
SRCS_EXT+= ExecutionEngine/Orc/NullResolver.cpp
SRCS_EXT+= ExecutionEngine/Orc/ObjectLinkingLayer.cpp
SRCS_EXT+= ExecutionEngine/Orc/ObjectTransformLayer.cpp
SRCS_EXT+= ExecutionEngine/Orc/OrcABISupport.cpp
SRCS_EXT+= ExecutionEngine/Orc/OrcCBindings.cpp
SRCS_EXT+= ExecutionEngine/Orc/OrcMCJITReplacement.cpp
SRCS_EXT+= ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
SRCS_EXT+= ExecutionEngine/Orc/Speculation.cpp
SRCS_EXT+= ExecutionEngine/Orc/ThreadSafeModule.cpp
SRCS_EXT+= ExecutionEngine/OrcError/OrcError.cpp
SRCS_EXT+= ExecutionEngine/OrcError/RPCError.cpp
SRCS_XDB+= ExecutionEngine/RuntimeDyld/JITSymbol.cpp
SRCS_XDB+= ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
SRCS_XDB+= ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
SRCS_XDB+= ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
SRCS_XDB+= ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
SRCS_XDB+= ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
SRCS_XDB+= ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
SRCS_XDB+= ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp
SRCS_XDB+= ExecutionEngine/SectionMemoryManager.cpp
SRCS_XDB+= ExecutionEngine/TargetSelect.cpp
SRCS_MIN+= Frontend/OpenMP/OMPConstants.cpp
SRCS_MIN+= Frontend/OpenMP/OMPIRBuilder.cpp
SRCS_MIN+= IR/AbstractCallSite.cpp
SRCS_MIN+= IR/AsmWriter.cpp
SRCS_MIN+= IR/Attributes.cpp
SRCS_MIN+= IR/AutoUpgrade.cpp
SRCS_MIN+= IR/BasicBlock.cpp
SRCS_MIN+= IR/Comdat.cpp
SRCS_MIN+= IR/ConstantFold.cpp
SRCS_MIN+= IR/ConstantRange.cpp
SRCS_MIN+= IR/Constants.cpp
SRCS_MIN+= IR/Core.cpp
SRCS_MIN+= IR/DIBuilder.cpp
SRCS_MIN+= IR/DataLayout.cpp
SRCS_MIN+= IR/DebugInfo.cpp
SRCS_MIN+= IR/DebugInfoMetadata.cpp
SRCS_MIN+= IR/DebugLoc.cpp
SRCS_MIN+= IR/DiagnosticHandler.cpp
SRCS_MIN+= IR/DiagnosticInfo.cpp
SRCS_MIN+= IR/DiagnosticPrinter.cpp
SRCS_MIN+= IR/Dominators.cpp
SRCS_MIN+= IR/FPEnv.cpp
SRCS_MIN+= IR/Function.cpp
SRCS_MIN+= IR/GVMaterializer.cpp
SRCS_MIN+= IR/Globals.cpp
SRCS_MIN+= IR/IRBuilder.cpp
SRCS_MIN+= IR/IRPrintingPasses.cpp
SRCS_MIN+= IR/InlineAsm.cpp
SRCS_MIN+= IR/Instruction.cpp
SRCS_MIN+= IR/Instructions.cpp
SRCS_MIN+= IR/IntrinsicInst.cpp
SRCS_MIN+= IR/LLVMContext.cpp
SRCS_MIN+= IR/LLVMContextImpl.cpp
SRCS_MIN+= IR/LegacyPassManager.cpp
SRCS_MIN+= IR/MDBuilder.cpp
SRCS_MIN+= IR/Mangler.cpp
SRCS_MIN+= IR/Metadata.cpp
SRCS_MIN+= IR/Module.cpp
SRCS_MIN+= IR/ModuleSummaryIndex.cpp
SRCS_MIN+= IR/Operator.cpp
SRCS_MIN+= IR/OptBisect.cpp
SRCS_MIN+= IR/Pass.cpp
SRCS_MIN+= IR/PassInstrumentation.cpp
SRCS_MIN+= IR/PassManager.cpp
SRCS_MIN+= IR/PassRegistry.cpp
SRCS_MIN+= IR/PassTimingInfo.cpp
SRCS_MIN+= IR/ProfileSummary.cpp
SRCS_MIN+= IR/RemarkStreamer.cpp
SRCS_MIN+= IR/SafepointIRVerifier.cpp
SRCS_MIN+= IR/Statepoint.cpp
SRCS_MIN+= IR/Type.cpp
SRCS_MIN+= IR/TypeFinder.cpp
SRCS_MIN+= IR/Use.cpp
SRCS_MIN+= IR/User.cpp
SRCS_MIN+= IR/Value.cpp
SRCS_MIN+= IR/ValueSymbolTable.cpp
SRCS_MIN+= IR/Verifier.cpp
SRCS_MIN+= IRReader/IRReader.cpp
SRCS_EXL+= LTO/Caching.cpp
SRCS_MIN+= LTO/LTO.cpp
SRCS_MIN+= LTO/LTOBackend.cpp
SRCS_EXL+= LTO/LTOCodeGenerator.cpp
SRCS_EXL+= LTO/LTOModule.cpp
SRCS_EXL+= LTO/SummaryBasedOptimizations.cpp
SRCS_EXL+= LTO/ThinLTOCodeGenerator.cpp
SRCS_MIN+= LTO/UpdateCompilerUsed.cpp
SRCS_MIN+= LineEditor/LineEditor.cpp
SRCS_MIN+= Linker/IRMover.cpp
SRCS_MIN+= Linker/LinkModules.cpp
SRCS_MIN+= MC/ConstantPools.cpp
SRCS_MIN+= MC/ELFObjectWriter.cpp
SRCS_MIN+= MC/MCAsmBackend.cpp
SRCS_MIN+= MC/MCAsmInfo.cpp
SRCS_MIN+= MC/MCAsmInfoCOFF.cpp
SRCS_MIN+= MC/MCAsmInfoDarwin.cpp
SRCS_MIN+= MC/MCAsmInfoELF.cpp
SRCS_MIN+= MC/MCAsmInfoXCOFF.cpp
SRCS_MIN+= MC/MCAsmMacro.cpp
SRCS_MIN+= MC/MCAsmStreamer.cpp
SRCS_MIN+= MC/MCAssembler.cpp
SRCS_MIN+= MC/MCCodeEmitter.cpp
SRCS_MIN+= MC/MCCodeView.cpp
SRCS_MIN+= MC/MCContext.cpp
SRCS_XDL+= MC/MCDisassembler/Disassembler.cpp
SRCS_XDW+= MC/MCDisassembler/MCDisassembler.cpp
SRCS_XDW+= MC/MCDisassembler/MCExternalSymbolizer.cpp
SRCS_MIN+= MC/MCDisassembler/MCRelocationInfo.cpp
SRCS_XDW+= MC/MCDisassembler/MCSymbolizer.cpp
SRCS_MIN+= MC/MCDwarf.cpp
SRCS_MIN+= MC/MCELFObjectTargetWriter.cpp
SRCS_MIN+= MC/MCELFStreamer.cpp
SRCS_MIN+= MC/MCExpr.cpp
SRCS_MIN+= MC/MCFragment.cpp
SRCS_MIN+= MC/MCInst.cpp
SRCS_MIN+= MC/MCInstPrinter.cpp
SRCS_MIN+= MC/MCInstrAnalysis.cpp
SRCS_MIN+= MC/MCInstrDesc.cpp
SRCS_MIN+= MC/MCLinkerOptimizationHint.cpp
SRCS_MIN+= MC/MCMachOStreamer.cpp
SRCS_MIN+= MC/MCMachObjectTargetWriter.cpp
SRCS_MIN+= MC/MCNullStreamer.cpp
SRCS_MIN+= MC/MCObjectFileInfo.cpp
SRCS_MIN+= MC/MCObjectStreamer.cpp
SRCS_MIN+= MC/MCObjectWriter.cpp
SRCS_MIN+= MC/MCParser/AsmLexer.cpp
SRCS_MIN+= MC/MCParser/AsmParser.cpp
SRCS_MIN+= MC/MCParser/COFFAsmParser.cpp
SRCS_MIN+= MC/MCParser/DarwinAsmParser.cpp
SRCS_MIN+= MC/MCParser/ELFAsmParser.cpp
SRCS_MIN+= MC/MCParser/MCAsmLexer.cpp
SRCS_MIN+= MC/MCParser/MCAsmParser.cpp
SRCS_MIN+= MC/MCParser/MCAsmParserExtension.cpp
SRCS_MIN+= MC/MCParser/MCTargetAsmParser.cpp
SRCS_MIN+= MC/MCParser/WasmAsmParser.cpp
SRCS_MIN+= MC/MCRegisterInfo.cpp
SRCS_MIN+= MC/MCSchedule.cpp
SRCS_MIN+= MC/MCSection.cpp
SRCS_MIN+= MC/MCSectionCOFF.cpp
SRCS_MIN+= MC/MCSectionELF.cpp
SRCS_MIN+= MC/MCSectionMachO.cpp
SRCS_MIN+= MC/MCSectionWasm.cpp
SRCS_MIN+= MC/MCSectionXCOFF.cpp
SRCS_MIN+= MC/MCStreamer.cpp
SRCS_MIN+= MC/MCSubtargetInfo.cpp
SRCS_MIN+= MC/MCSymbol.cpp
SRCS_MIN+= MC/MCSymbolELF.cpp
SRCS_MIN+= MC/MCTargetOptions.cpp
SRCS_MIN+= MC/MCValue.cpp
SRCS_MIN+= MC/MCWasmStreamer.cpp
SRCS_MIN+= MC/MCWin64EH.cpp
SRCS_MIN+= MC/MCWinCOFFStreamer.cpp
SRCS_MIN+= MC/MCWinEH.cpp
SRCS_MIN+= MC/MCXCOFFStreamer.cpp
SRCS_MIN+= MC/MCXCOFFObjectTargetWriter.cpp
SRCS_MIN+= MC/MachObjectWriter.cpp
SRCS_MIN+= MC/StringTableBuilder.cpp
SRCS_MIN+= MC/SubtargetFeature.cpp
SRCS_MIN+= MC/WasmObjectWriter.cpp
SRCS_MIN+= MC/WinCOFFObjectWriter.cpp
SRCS_MIN+= MC/XCOFFObjectWriter.cpp
SRCS_EXT+= MCA/CodeEmitter.cpp
SRCS_EXT+= MCA/Context.cpp
SRCS_EXT+= MCA/HWEventListener.cpp
SRCS_EXT+= MCA/HardwareUnits/HardwareUnit.cpp
SRCS_EXT+= MCA/HardwareUnits/LSUnit.cpp
SRCS_EXT+= MCA/HardwareUnits/RegisterFile.cpp
SRCS_EXT+= MCA/HardwareUnits/ResourceManager.cpp
SRCS_EXT+= MCA/HardwareUnits/RetireControlUnit.cpp
SRCS_EXT+= MCA/HardwareUnits/Scheduler.cpp
SRCS_EXT+= MCA/InstrBuilder.cpp
SRCS_EXT+= MCA/Instruction.cpp
SRCS_EXT+= MCA/Pipeline.cpp
SRCS_EXT+= MCA/Stages/DispatchStage.cpp
SRCS_EXT+= MCA/Stages/EntryStage.cpp
SRCS_EXT+= MCA/Stages/ExecuteStage.cpp
SRCS_EXT+= MCA/Stages/InstructionTables.cpp
SRCS_EXT+= MCA/Stages/MicroOpQueueStage.cpp
SRCS_EXT+= MCA/Stages/RetireStage.cpp
SRCS_EXT+= MCA/Stages/Stage.cpp
SRCS_EXT+= MCA/Support.cpp
SRCS_MIN+= Object/Archive.cpp
SRCS_MIN+= Object/ArchiveWriter.cpp
SRCS_MIN+= Object/Binary.cpp
SRCS_MIN+= Object/COFFImportFile.cpp
SRCS_MIW+= Object/COFFModuleDefinition.cpp
SRCS_MIN+= Object/COFFObjectFile.cpp
SRCS_MIN+= Object/Decompressor.cpp
SRCS_MIN+= Object/ELF.cpp
SRCS_MIN+= Object/ELFObjectFile.cpp
SRCS_MIN+= Object/Error.cpp
SRCS_MIN+= Object/IRObjectFile.cpp
SRCS_MIN+= Object/IRSymtab.cpp
SRCS_MIN+= Object/MachOObjectFile.cpp
SRCS_MIW+= Object/MachOUniversal.cpp
SRCS_MIW+= Object/Minidump.cpp
SRCS_MIN+= Object/ModuleSymbolTable.cpp
SRCS_EXT+= Object/Object.cpp
SRCS_MIN+= Object/ObjectFile.cpp
SRCS_MIN+= Object/RecordStreamer.cpp
SRCS_MIW+= Object/RelocationResolver.cpp
SRCS_MIW+= Object/SymbolSize.cpp
SRCS_MIN+= Object/SymbolicFile.cpp
SRCS_MIW+= Object/TapiUniversal.cpp
SRCS_MIN+= Object/WasmObjectFile.cpp
SRCS_MIW+= Object/WindowsMachineFlag.cpp
SRCS_MIN+= Object/WindowsResource.cpp
SRCS_MIN+= Object/XCOFFObjectFile.cpp
SRCS_MIN+= ObjectYAML/COFFYAML.cpp
SRCS_EXT+= ObjectYAML/CodeViewYAMLDebugSections.cpp
SRCS_EXT+= ObjectYAML/CodeViewYAMLSymbols.cpp
SRCS_EXT+= ObjectYAML/CodeViewYAMLTypes.cpp
SRCS_MIN+= ObjectYAML/DWARFYAML.cpp
SRCS_MIN+= ObjectYAML/ELFYAML.cpp
SRCS_MIN+= ObjectYAML/MachOYAML.cpp
SRCS_EXT+= ObjectYAML/YAML.cpp
SRCS_MIN+= Option/Arg.cpp
SRCS_MIN+= Option/ArgList.cpp
SRCS_MIN+= Option/OptTable.cpp
SRCS_MIN+= Option/Option.cpp
SRCS_MIN+= Passes/PassBuilder.cpp
SRCS_MIN+= Passes/PassPlugin.cpp
SRCS_MIN+= Passes/StandardInstrumentations.cpp
SRCS_MIN+= ProfileData/Coverage/CoverageMapping.cpp
SRCS_MIN+= ProfileData/Coverage/CoverageMappingReader.cpp
SRCS_MIN+= ProfileData/Coverage/CoverageMappingWriter.cpp
SRCS_MIN+= ProfileData/GCOV.cpp
SRCS_MIN+= ProfileData/InstrProf.cpp
SRCS_MIN+= ProfileData/InstrProfReader.cpp
SRCS_MIN+= ProfileData/InstrProfWriter.cpp
SRCS_MIN+= ProfileData/ProfileSummaryBuilder.cpp
SRCS_MIN+= ProfileData/SampleProf.cpp
SRCS_MIN+= ProfileData/SampleProfReader.cpp
SRCS_MIN+= ProfileData/SampleProfWriter.cpp
SRCS_MIN+= Remarks/BitstreamRemarkSerializer.cpp
SRCS_MIN+= Remarks/RemarkFormat.cpp
SRCS_MIN+= Remarks/RemarkSerializer.cpp
SRCS_MIN+= Remarks/RemarkStringTable.cpp
SRCS_MIN+= Remarks/YAMLRemarkSerializer.cpp
SRCS_MIN+= Support/AArch64TargetParser.cpp
SRCS_MIN+= Support/ABIBreak.cpp
SRCS_MIN+= Support/APFloat.cpp
SRCS_MIN+= Support/APInt.cpp
SRCS_MIN+= Support/APSInt.cpp
SRCS_MIN+= Support/ARMAttributeParser.cpp
SRCS_MIN+= Support/ARMBuildAttrs.cpp
SRCS_MIN+= Support/ARMTargetParser.cpp
SRCS_MIN+= Support/Allocator.cpp
SRCS_MIN+= Support/BinaryStreamError.cpp
SRCS_MIN+= Support/BinaryStreamReader.cpp
SRCS_MIN+= Support/BinaryStreamRef.cpp
SRCS_MIN+= Support/BinaryStreamWriter.cpp
SRCS_MIN+= Support/BlockFrequency.cpp
SRCS_MIN+= Support/BranchProbability.cpp
SRCS_MIN+= Support/BuryPointer.cpp
SRCS_MIN+= Support/CachePruning.cpp
SRCS_MIW+= Support/COM.cpp
SRCS_MIN+= Support/CRC.cpp
SRCS_MIN+= Support/Chrono.cpp
SRCS_MIN+= Support/CodeGenCoverage.cpp
SRCS_MIN+= Support/CommandLine.cpp
SRCS_MIN+= Support/Compression.cpp
SRCS_MIN+= Support/ConvertUTF.cpp
SRCS_MIN+= Support/ConvertUTFWrapper.cpp
SRCS_MIN+= Support/CrashRecoveryContext.cpp
SRCS_MIN+= Support/DAGDeltaAlgorithm.cpp
SRCS_MIN+= Support/DJB.cpp
SRCS_MIN+= Support/DataExtractor.cpp
SRCS_MIN+= Support/Debug.cpp
SRCS_MIN+= Support/DebugCounter.cpp
SRCS_MIN+= Support/DeltaAlgorithm.cpp
SRCS_MIN+= Support/DynamicLibrary.cpp
SRCS_MIN+= Support/Errno.cpp
SRCS_MIN+= Support/Error.cpp
SRCS_MIN+= Support/ErrorHandling.cpp
SRCS_XDB+= Support/FileCollector.cpp
SRCS_EXL+= Support/FileOutputBuffer.cpp
SRCS_MIN+= Support/FileUtilities.cpp
SRCS_MIN+= Support/FoldingSet.cpp
SRCS_MIN+= Support/FormatVariadic.cpp
SRCS_MIN+= Support/FormattedStream.cpp
SRCS_MIN+= Support/GlobPattern.cpp
SRCS_MIN+= Support/GraphWriter.cpp
SRCS_MIN+= Support/Hashing.cpp
SRCS_MIN+= Support/Host.cpp
SRCS_MIN+= Support/InitLLVM.cpp
SRCS_MIN+= Support/IntEqClasses.cpp
SRCS_MIN+= Support/IntervalMap.cpp
SRCS_MIN+= Support/ItaniumManglingCanonicalizer.cpp
SRCS_MIN+= Support/JSON.cpp
SRCS_MIN+= Support/KnownBits.cpp
SRCS_MIN+= Support/LEB128.cpp
SRCS_MIN+= Support/LineIterator.cpp
SRCS_MIN+= Support/Locale.cpp
SRCS_MIN+= Support/LockFileManager.cpp
SRCS_MIN+= Support/LowLevelType.cpp
SRCS_MIN+= Support/MD5.cpp
SRCS_MIN+= Support/ManagedStatic.cpp
SRCS_MIN+= Support/MathExtras.cpp
SRCS_XDL+= Support/Memory.cpp
SRCS_MIN+= Support/MemoryBuffer.cpp
SRCS_MIN+= Support/NativeFormatting.cpp
SRCS_MIN+= Support/Optional.cpp
SRCS_LLD+= Support/Parallel.cpp
SRCS_MIN+= Support/Path.cpp
SRCS_MIN+= Support/PluginLoader.cpp
SRCS_MIN+= Support/PrettyStackTrace.cpp
SRCS_MIN+= Support/Process.cpp
SRCS_MIN+= Support/Program.cpp
SRCS_MIN+= Support/RWMutex.cpp
SRCS_MIN+= Support/RandomNumberGenerator.cpp
SRCS_MIN+= Support/Regex.cpp
SRCS_MIN+= Support/SHA1.cpp
SRCS_MIN+= Support/ScaledNumber.cpp
SRCS_MIN+= Support/ScopedPrinter.cpp
SRCS_MIN+= Support/Signals.cpp
SRCS_MIN+= Support/Signposts.cpp
SRCS_MIN+= Support/SmallPtrSet.cpp
SRCS_MIN+= Support/SmallVector.cpp
SRCS_MIN+= Support/SourceMgr.cpp
SRCS_MIN+= Support/SpecialCaseList.cpp
SRCS_MIN+= Support/Statistic.cpp
SRCS_MIN+= Support/StringExtras.cpp
SRCS_MIN+= Support/StringMap.cpp
SRCS_MIN+= Support/StringRef.cpp
SRCS_MIN+= Support/StringSaver.cpp
SRCS_MIN+= Support/SymbolRemappingReader.cpp
SRCS_EXT+= Support/SystemUtils.cpp
SRCS_LLD+= Support/TarWriter.cpp
SRCS_MIN+= Support/TargetParser.cpp
SRCS_MIN+= Support/TargetRegistry.cpp
SRCS_MIN+= Support/ThreadLocal.cpp
SRCS_MIW+= Support/ThreadPool.cpp
SRCS_MIN+= Support/Threading.cpp
SRCS_MIN+= Support/TimeProfiler.cpp
SRCS_MIN+= Support/Timer.cpp
SRCS_MIN+= Support/ToolOutputFile.cpp
SRCS_MIN+= Support/TrigramIndex.cpp
SRCS_MIN+= Support/Triple.cpp
SRCS_MIN+= Support/Twine.cpp
SRCS_MIN+= Support/Unicode.cpp
SRCS_MIN+= Support/UnicodeCaseFold.cpp
SRCS_MIN+= Support/Valgrind.cpp
SRCS_MIN+= Support/VirtualFileSystem.cpp
SRCS_MIN+= Support/VersionTuple.cpp
SRCS_MIN+= Support/Watchdog.cpp
SRCS_MIN+= Support/WithColor.cpp
SRCS_MIN+= Support/YAMLParser.cpp
SRCS_MIN+= Support/YAMLTraits.cpp
SRCS_FUL+= Support/Z3Solver.cpp
SRCS_MIN+= Support/circular_raw_ostream.cpp
SRCS_MIN+= Support/raw_os_ostream.cpp
SRCS_MIN+= Support/raw_ostream.cpp
SRCS_MIN+= Support/regcomp.c
SRCS_MIN+= Support/regerror.c
SRCS_MIN+= Support/regexec.c
SRCS_MIN+= Support/regfree.c
SRCS_MIN+= Support/regstrlcpy.c
SRCS_MIN+= Support/xxhash.cpp
SRCS_MIN+= TableGen/Error.cpp
SRCS_MIN+= TableGen/JSONBackend.cpp
SRCS_MIN+= TableGen/Main.cpp
SRCS_MIN+= TableGen/Record.cpp
SRCS_MIN+= TableGen/SetTheory.cpp
SRCS_MIN+= TableGen/StringMatcher.cpp
SRCS_MIN+= TableGen/TGLexer.cpp
SRCS_MIN+= TableGen/TGParser.cpp
SRCS_MIN+= TableGen/TableGenBackend.cpp
.if ${MK_LLVM_TARGET_AARCH64} != "no"
SRCS_MIN+= Target/AArch64/AArch64A53Fix835769.cpp
SRCS_MIN+= Target/AArch64/AArch64A57FPLoadBalancing.cpp
SRCS_MIN+= Target/AArch64/AArch64AdvSIMDScalarPass.cpp
SRCS_MIN+= Target/AArch64/AArch64AsmPrinter.cpp
SRCS_MIN+= Target/AArch64/AArch64BranchTargets.cpp
SRCS_MIN+= Target/AArch64/AArch64CallLowering.cpp
SRCS_MIN+= Target/AArch64/AArch64CallingConvention.cpp
SRCS_MIN+= Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
SRCS_MIN+= Target/AArch64/AArch64CollectLOH.cpp
SRCS_MIN+= Target/AArch64/AArch64CompressJumpTables.cpp
SRCS_MIN+= Target/AArch64/AArch64CondBrTuning.cpp
SRCS_MIN+= Target/AArch64/AArch64ConditionOptimizer.cpp
SRCS_MIN+= Target/AArch64/AArch64ConditionalCompares.cpp
SRCS_MIN+= Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
SRCS_MIN+= Target/AArch64/AArch64ExpandImm.cpp
SRCS_MIN+= Target/AArch64/AArch64ExpandPseudoInsts.cpp
SRCS_MIN+= Target/AArch64/AArch64FalkorHWPFFix.cpp
SRCS_MIN+= Target/AArch64/AArch64FastISel.cpp
SRCS_MIN+= Target/AArch64/AArch64FrameLowering.cpp
SRCS_MIN+= Target/AArch64/AArch64ISelDAGToDAG.cpp
SRCS_MIN+= Target/AArch64/AArch64ISelLowering.cpp
SRCS_MIN+= Target/AArch64/AArch64InstrInfo.cpp
SRCS_MIN+= Target/AArch64/AArch64InstructionSelector.cpp
SRCS_MIN+= Target/AArch64/AArch64LegalizerInfo.cpp
SRCS_MIN+= Target/AArch64/AArch64LoadStoreOptimizer.cpp
SRCS_MIN+= Target/AArch64/AArch64MCInstLower.cpp
SRCS_MIN+= Target/AArch64/AArch64MacroFusion.cpp
SRCS_MIN+= Target/AArch64/AArch64PBQPRegAlloc.cpp
SRCS_MIN+= Target/AArch64/AArch64PreLegalizerCombiner.cpp
SRCS_MIN+= Target/AArch64/AArch64PromoteConstant.cpp
SRCS_MIN+= Target/AArch64/AArch64RedundantCopyElimination.cpp
SRCS_MIN+= Target/AArch64/AArch64RegisterBankInfo.cpp
SRCS_MIN+= Target/AArch64/AArch64RegisterInfo.cpp
SRCS_MIN+= Target/AArch64/AArch64SIMDInstrOpt.cpp
SRCS_MIN+= Target/AArch64/AArch64SelectionDAGInfo.cpp
SRCS_MIN+= Target/AArch64/AArch64SpeculationHardening.cpp
SRCS_MIN+= Target/AArch64/AArch64StackTagging.cpp
SRCS_MIN+= Target/AArch64/AArch64StackTaggingPreRA.cpp
SRCS_MIN+= Target/AArch64/AArch64StorePairSuppress.cpp
SRCS_MIN+= Target/AArch64/AArch64Subtarget.cpp
SRCS_MIN+= Target/AArch64/AArch64TargetMachine.cpp
SRCS_MIN+= Target/AArch64/AArch64TargetObjectFile.cpp
SRCS_MIN+= Target/AArch64/AArch64TargetTransformInfo.cpp
SRCS_MIN+= Target/AArch64/AsmParser/AArch64AsmParser.cpp
SRCS_XDW+= Target/AArch64/Disassembler/AArch64Disassembler.cpp
SRCS_XDW+= Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
SRCS_MIN+= Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
SRCS_MIN+= Target/AArch64/Utils/AArch64BaseInfo.cpp
.endif # MK_LLVM_TARGET_AARCH64
.if ${MK_LLVM_TARGET_ARM} != "no"
SRCS_MIN+= Target/ARM/A15SDOptimizer.cpp
SRCS_MIN+= Target/ARM/ARMAsmPrinter.cpp
SRCS_MIN+= Target/ARM/ARMBaseInstrInfo.cpp
SRCS_MIN+= Target/ARM/ARMBaseRegisterInfo.cpp
SRCS_MIN+= Target/ARM/ARMBasicBlockInfo.cpp
SRCS_MIN+= Target/ARM/ARMCallLowering.cpp
SRCS_MIN+= Target/ARM/ARMCallingConv.cpp
SRCS_MIN+= Target/ARM/ARMConstantIslandPass.cpp
SRCS_MIN+= Target/ARM/ARMConstantPoolValue.cpp
SRCS_MIN+= Target/ARM/ARMExpandPseudoInsts.cpp
SRCS_MIN+= Target/ARM/ARMFastISel.cpp
SRCS_MIN+= Target/ARM/ARMFrameLowering.cpp
SRCS_MIN+= Target/ARM/ARMHazardRecognizer.cpp
SRCS_MIN+= Target/ARM/ARMISelDAGToDAG.cpp
SRCS_MIN+= Target/ARM/ARMISelLowering.cpp
SRCS_MIN+= Target/ARM/ARMInstrInfo.cpp
SRCS_MIN+= Target/ARM/ARMInstructionSelector.cpp
SRCS_MIN+= Target/ARM/ARMLegalizerInfo.cpp
SRCS_MIN+= Target/ARM/ARMLoadStoreOptimizer.cpp
SRCS_MIN+= Target/ARM/ARMLowOverheadLoops.cpp
SRCS_MIN+= Target/ARM/ARMMCInstLower.cpp
SRCS_MIN+= Target/ARM/ARMMachineFunctionInfo.cpp
SRCS_MIN+= Target/ARM/ARMMacroFusion.cpp
SRCS_MIN+= Target/ARM/ARMOptimizeBarriersPass.cpp
SRCS_MIN+= Target/ARM/ARMParallelDSP.cpp
SRCS_MIN+= Target/ARM/ARMRegisterBankInfo.cpp
SRCS_MIN+= Target/ARM/ARMRegisterInfo.cpp
SRCS_MIN+= Target/ARM/ARMSelectionDAGInfo.cpp
SRCS_MIN+= Target/ARM/ARMSubtarget.cpp
SRCS_MIN+= Target/ARM/ARMTargetMachine.cpp
SRCS_MIN+= Target/ARM/ARMTargetObjectFile.cpp
SRCS_MIN+= Target/ARM/ARMTargetTransformInfo.cpp
SRCS_MIN+= Target/ARM/AsmParser/ARMAsmParser.cpp
SRCS_MIN+= Target/ARM/Disassembler/ARMDisassembler.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMCExpr.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
SRCS_MIN+= Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
SRCS_MIN+= Target/ARM/MLxExpansionPass.cpp
SRCS_MIN+= Target/ARM/MVEGatherScatterLowering.cpp
SRCS_MIN+= Target/ARM/MVETailPredication.cpp
SRCS_MIN+= Target/ARM/MVEVPTBlockPass.cpp
SRCS_MIN+= Target/ARM/TargetInfo/ARMTargetInfo.cpp
SRCS_MIN+= Target/ARM/Thumb1FrameLowering.cpp
SRCS_MIN+= Target/ARM/Thumb1InstrInfo.cpp
SRCS_MIN+= Target/ARM/Thumb2ITBlockPass.cpp
SRCS_MIN+= Target/ARM/Thumb2InstrInfo.cpp
SRCS_MIN+= Target/ARM/Thumb2SizeReduction.cpp
SRCS_MIN+= Target/ARM/ThumbRegisterInfo.cpp
SRCS_MIN+= Target/ARM/Utils/ARMBaseInfo.cpp
.endif # MK_LLVM_TARGET_ARM
.if ${MK_LLVM_TARGET_BPF} != "no"
SRCS_MIN+= Target/BPF/AsmParser/BPFAsmParser.cpp
SRCS_MIN+= Target/BPF/BPFAbstractMemberAccess.cpp
SRCS_MIN+= Target/BPF/BPFAsmPrinter.cpp
SRCS_MIN+= Target/BPF/BPFFrameLowering.cpp
SRCS_MIN+= Target/BPF/BPFISelDAGToDAG.cpp
SRCS_MIN+= Target/BPF/BPFISelLowering.cpp
SRCS_MIN+= Target/BPF/BPFInstrInfo.cpp
SRCS_MIN+= Target/BPF/BPFMCInstLower.cpp
SRCS_MIN+= Target/BPF/BPFMIChecking.cpp
SRCS_MIN+= Target/BPF/BPFMIPeephole.cpp
SRCS_MIN+= Target/BPF/BPFMISimplifyPatchable.cpp
SRCS_MIN+= Target/BPF/BPFRegisterInfo.cpp
SRCS_MIN+= Target/BPF/BPFSelectionDAGInfo.cpp
SRCS_MIN+= Target/BPF/BPFSubtarget.cpp
SRCS_MIN+= Target/BPF/BPFTargetMachine.cpp
SRCS_MIN+= Target/BPF/BTFDebug.cpp
SRCS_MIN+= Target/BPF/Disassembler/BPFDisassembler.cpp
SRCS_MIN+= Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
SRCS_MIN+= Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
SRCS_MIN+= Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
SRCS_MIN+= Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
SRCS_MIN+= Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
SRCS_MIN+= Target/BPF/TargetInfo/BPFTargetInfo.cpp
.endif # MK_LLVM_TARGET_BPF
.if ${MK_LLVM_TARGET_MIPS} != "no"
SRCS_MIN+= Target/Mips/AsmParser/MipsAsmParser.cpp
SRCS_XDW+= Target/Mips/Disassembler/MipsDisassembler.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsABIInfo.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsMCExpr.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
SRCS_MIN+= Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
SRCS_MIN+= Target/Mips/MicroMipsSizeReduction.cpp
SRCS_MIN+= Target/Mips/Mips16FrameLowering.cpp
SRCS_MIN+= Target/Mips/Mips16HardFloat.cpp
SRCS_MIN+= Target/Mips/Mips16HardFloatInfo.cpp
SRCS_MIN+= Target/Mips/Mips16ISelDAGToDAG.cpp
SRCS_MIN+= Target/Mips/Mips16ISelLowering.cpp
SRCS_MIN+= Target/Mips/Mips16InstrInfo.cpp
SRCS_MIN+= Target/Mips/Mips16RegisterInfo.cpp
SRCS_MIN+= Target/Mips/MipsAnalyzeImmediate.cpp
SRCS_MIN+= Target/Mips/MipsAsmPrinter.cpp
SRCS_MIN+= Target/Mips/MipsBranchExpansion.cpp
SRCS_MIN+= Target/Mips/MipsCCState.cpp
SRCS_MIN+= Target/Mips/MipsCallLowering.cpp
SRCS_MIN+= Target/Mips/MipsConstantIslandPass.cpp
SRCS_MIN+= Target/Mips/MipsDelaySlotFiller.cpp
SRCS_MIN+= Target/Mips/MipsExpandPseudo.cpp
SRCS_MIN+= Target/Mips/MipsFastISel.cpp
SRCS_MIN+= Target/Mips/MipsFrameLowering.cpp
SRCS_MIN+= Target/Mips/MipsISelDAGToDAG.cpp
SRCS_MIN+= Target/Mips/MipsISelLowering.cpp
SRCS_MIN+= Target/Mips/MipsInstrInfo.cpp
SRCS_MIN+= Target/Mips/MipsInstructionSelector.cpp
SRCS_MIN+= Target/Mips/MipsLegalizerInfo.cpp
SRCS_MIN+= Target/Mips/MipsMCInstLower.cpp
SRCS_MIN+= Target/Mips/MipsMachineFunction.cpp
SRCS_MIN+= Target/Mips/MipsModuleISelDAGToDAG.cpp
SRCS_MIN+= Target/Mips/MipsOptimizePICCall.cpp
SRCS_MIN+= Target/Mips/MipsOs16.cpp
SRCS_MIN+= Target/Mips/MipsPreLegalizerCombiner.cpp
SRCS_MIN+= Target/Mips/MipsRegisterBankInfo.cpp
SRCS_MIN+= Target/Mips/MipsRegisterInfo.cpp
SRCS_MIN+= Target/Mips/MipsSEFrameLowering.cpp
SRCS_MIN+= Target/Mips/MipsSEISelDAGToDAG.cpp
SRCS_MIN+= Target/Mips/MipsSEISelLowering.cpp
SRCS_MIN+= Target/Mips/MipsSEInstrInfo.cpp
SRCS_MIN+= Target/Mips/MipsSERegisterInfo.cpp
SRCS_MIN+= Target/Mips/MipsSubtarget.cpp
SRCS_MIN+= Target/Mips/MipsTargetMachine.cpp
SRCS_MIN+= Target/Mips/MipsTargetObjectFile.cpp
SRCS_MIN+= Target/Mips/TargetInfo/MipsTargetInfo.cpp
.endif # MK_LLVM_TARGET_MIPS
.if ${MK_LLVM_TARGET_POWERPC} != "no"
SRCS_MIN+= Target/PowerPC/AsmParser/PPCAsmParser.cpp
SRCS_MIN+= Target/PowerPC/Disassembler/PPCDisassembler.cpp
SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
SRCS_MIN+= Target/PowerPC/PPCAsmPrinter.cpp
SRCS_MIN+= Target/PowerPC/PPCBoolRetToInt.cpp
SRCS_MIN+= Target/PowerPC/PPCBranchCoalescing.cpp
SRCS_MIN+= Target/PowerPC/PPCBranchSelector.cpp
SRCS_MIN+= Target/PowerPC/PPCCCState.cpp
SRCS_MIN+= Target/PowerPC/PPCCTRLoops.cpp
SRCS_MIN+= Target/PowerPC/PPCCallingConv.cpp
SRCS_MIN+= Target/PowerPC/PPCEarlyReturn.cpp
SRCS_MIN+= Target/PowerPC/PPCExpandISEL.cpp
SRCS_MIN+= Target/PowerPC/PPCFastISel.cpp
SRCS_MIN+= Target/PowerPC/PPCFrameLowering.cpp
SRCS_MIN+= Target/PowerPC/PPCHazardRecognizers.cpp
SRCS_MIN+= Target/PowerPC/PPCISelDAGToDAG.cpp
SRCS_MIN+= Target/PowerPC/PPCISelLowering.cpp
SRCS_MIN+= Target/PowerPC/PPCInstrInfo.cpp
SRCS_MIN+= Target/PowerPC/PPCLoopInstrFormPrep.cpp
SRCS_MIN+= Target/PowerPC/PPCLowerMASSVEntries.cpp
SRCS_MIN+= Target/PowerPC/PPCMCInstLower.cpp
SRCS_MIN+= Target/PowerPC/PPCMIPeephole.cpp
SRCS_MIN+= Target/PowerPC/PPCMachineFunctionInfo.cpp
SRCS_MIN+= Target/PowerPC/PPCMachineScheduler.cpp
SRCS_MIN+= Target/PowerPC/PPCPreEmitPeephole.cpp
SRCS_MIN+= Target/PowerPC/PPCQPXLoadSplat.cpp
SRCS_MIN+= Target/PowerPC/PPCReduceCRLogicals.cpp
SRCS_MIN+= Target/PowerPC/PPCRegisterInfo.cpp
SRCS_MIN+= Target/PowerPC/PPCSubtarget.cpp
SRCS_MIN+= Target/PowerPC/PPCTLSDynamicCall.cpp
SRCS_MIN+= Target/PowerPC/PPCTOCRegDeps.cpp
SRCS_MIN+= Target/PowerPC/PPCTargetMachine.cpp
SRCS_MIN+= Target/PowerPC/PPCTargetObjectFile.cpp
SRCS_MIN+= Target/PowerPC/PPCTargetTransformInfo.cpp
SRCS_MIN+= Target/PowerPC/PPCVSXCopy.cpp
SRCS_MIN+= Target/PowerPC/PPCVSXFMAMutate.cpp
SRCS_MIN+= Target/PowerPC/PPCVSXSwapRemoval.cpp
SRCS_MIN+= Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
.endif # MK_LLVM_TARGET_POWERPC
.if ${MK_LLVM_TARGET_RISCV} != "no"
SRCS_MIN+= Target/RISCV/AsmParser/RISCVAsmParser.cpp
SRCS_MIN+= Target/RISCV/Disassembler/RISCVDisassembler.cpp
SRCS_MIN+= Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
SRCS_MIN+= Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
SRCS_MIN+= Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
SRCS_MIN+= Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
SRCS_MIN+= Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
SRCS_MIN+= Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
SRCS_MIN+= Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
SRCS_MIN+= Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
SRCS_MIN+= Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
SRCS_MIN+= Target/RISCV/RISCVAsmPrinter.cpp
SRCS_MIN+= Target/RISCV/RISCVCallLowering.cpp
SRCS_MIN+= Target/RISCV/RISCVExpandPseudoInsts.cpp
SRCS_MIN+= Target/RISCV/RISCVFrameLowering.cpp
SRCS_MIN+= Target/RISCV/RISCVInstrInfo.cpp
SRCS_MIN+= Target/RISCV/RISCVInstructionSelector.cpp
SRCS_MIN+= Target/RISCV/RISCVISelDAGToDAG.cpp
SRCS_MIN+= Target/RISCV/RISCVISelLowering.cpp
SRCS_MIN+= Target/RISCV/RISCVLegalizerInfo.cpp
SRCS_MIN+= Target/RISCV/RISCVMCInstLower.cpp
SRCS_MIN+= Target/RISCV/RISCVMergeBaseOffset.cpp
SRCS_MIN+= Target/RISCV/RISCVRegisterBankInfo.cpp
SRCS_MIN+= Target/RISCV/RISCVRegisterInfo.cpp
SRCS_MIN+= Target/RISCV/RISCVSubtarget.cpp
SRCS_MIN+= Target/RISCV/RISCVTargetMachine.cpp
SRCS_MIN+= Target/RISCV/RISCVTargetObjectFile.cpp
SRCS_MIN+= Target/RISCV/RISCVTargetTransformInfo.cpp
SRCS_MIN+= Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
SRCS_MIN+= Target/RISCV/Utils/RISCVBaseInfo.cpp
SRCS_MIN+= Target/RISCV/Utils/RISCVMatInt.cpp
.endif # MK_LLVM_TARGET_RISCV
SRCS_MIN+= Target/Target.cpp
SRCS_MIN+= Target/TargetLoweringObjectFile.cpp
SRCS_MIN+= Target/TargetMachine.cpp
SRCS_MIN+= Target/TargetMachineC.cpp
.if ${MK_LLVM_TARGET_X86} != "no"
SRCS_MIN+= Target/X86/AsmParser/X86AsmParser.cpp
SRCS_XDW+= Target/X86/Disassembler/X86Disassembler.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86AsmBackend.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86InstComments.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
SRCS_MIN+= Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
SRCS_MIN+= Target/X86/TargetInfo/X86TargetInfo.cpp
SRCS_MIN+= Target/X86/Utils/X86ShuffleDecode.cpp
SRCS_MIN+= Target/X86/X86AsmPrinter.cpp
SRCS_MIN+= Target/X86/X86AvoidStoreForwardingBlocks.cpp
SRCS_MIN+= Target/X86/X86AvoidTrailingCall.cpp
SRCS_MIN+= Target/X86/X86CallFrameOptimization.cpp
SRCS_MIN+= Target/X86/X86CallLowering.cpp
SRCS_MIN+= Target/X86/X86CallingConv.cpp
SRCS_MIN+= Target/X86/X86CmovConversion.cpp
SRCS_MIN+= Target/X86/X86CondBrFolding.cpp
SRCS_MIN+= Target/X86/X86DiscriminateMemOps.cpp
SRCS_MIN+= Target/X86/X86DomainReassignment.cpp
SRCS_MIN+= Target/X86/X86EvexToVex.cpp
SRCS_MIN+= Target/X86/X86ExpandPseudo.cpp
SRCS_MIN+= Target/X86/X86FastISel.cpp
SRCS_MIN+= Target/X86/X86FixupBWInsts.cpp
SRCS_MIN+= Target/X86/X86FixupLEAs.cpp
SRCS_MIN+= Target/X86/X86FixupSetCC.cpp
SRCS_MIN+= Target/X86/X86FlagsCopyLowering.cpp
SRCS_MIN+= Target/X86/X86FloatingPoint.cpp
SRCS_MIN+= Target/X86/X86FrameLowering.cpp
SRCS_MIN+= Target/X86/X86ISelDAGToDAG.cpp
SRCS_MIN+= Target/X86/X86ISelLowering.cpp
SRCS_MIN+= Target/X86/X86IndirectBranchTracking.cpp
+SRCS_MIN+= Target/X86/X86IndirectThunks.cpp
SRCS_MIN+= Target/X86/X86InsertPrefetch.cpp
SRCS_MIN+= Target/X86/X86InstrFMA3Info.cpp
SRCS_MIN+= Target/X86/X86InstrFoldTables.cpp
SRCS_MIN+= Target/X86/X86InstrInfo.cpp
SRCS_MIN+= Target/X86/X86InstructionSelector.cpp
SRCS_MIN+= Target/X86/X86InterleavedAccess.cpp
SRCS_MIN+= Target/X86/X86LegalizerInfo.cpp
+SRCS_MIN+= Target/X86/X86LoadValueInjectionLoadHardening.cpp
+SRCS_MIN+= Target/X86/X86LoadValueInjectionRetHardening.cpp
SRCS_MIN+= Target/X86/X86MCInstLower.cpp
SRCS_MIN+= Target/X86/X86MachineFunctionInfo.cpp
SRCS_MIN+= Target/X86/X86MacroFusion.cpp
SRCS_MIN+= Target/X86/X86OptimizeLEAs.cpp
SRCS_MIN+= Target/X86/X86PadShortFunction.cpp
SRCS_MIN+= Target/X86/X86RegisterBankInfo.cpp
SRCS_MIN+= Target/X86/X86RegisterInfo.cpp
-SRCS_MIN+= Target/X86/X86RetpolineThunks.cpp
SRCS_MIN+= Target/X86/X86SelectionDAGInfo.cpp
SRCS_MIN+= Target/X86/X86ShuffleDecodeConstantPool.cpp
SRCS_MIN+= Target/X86/X86SpeculativeLoadHardening.cpp
SRCS_MIN+= Target/X86/X86Subtarget.cpp
SRCS_MIN+= Target/X86/X86TargetMachine.cpp
SRCS_MIN+= Target/X86/X86TargetObjectFile.cpp
SRCS_MIN+= Target/X86/X86TargetTransformInfo.cpp
SRCS_MIN+= Target/X86/X86VZeroUpper.cpp
SRCS_MIN+= Target/X86/X86WinAllocaExpander.cpp
SRCS_MIN+= Target/X86/X86WinEHState.cpp
.endif # MK_LLVM_TARGET_X86
SRCS_MIW+= TextAPI/MachO/Architecture.cpp
SRCS_MIW+= TextAPI/MachO/ArchitectureSet.cpp
SRCS_MIW+= TextAPI/MachO/InterfaceFile.cpp
SRCS_MIW+= TextAPI/MachO/PackedVersion.cpp
SRCS_MIW+= TextAPI/MachO/Platform.cpp
SRCS_MIW+= TextAPI/MachO/Target.cpp
SRCS_MIW+= TextAPI/MachO/TextStub.cpp
SRCS_MIW+= TextAPI/MachO/TextStubCommon.cpp
SRCS_MIN+= ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
SRCS_MIW+= ToolDrivers/llvm-lib/LibDriver.cpp
SRCS_MIN+= Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
SRCS_MIN+= Transforms/AggressiveInstCombine/TruncInstCombine.cpp
SRCS_MIN+= Transforms/CFGuard/CFGuard.cpp
SRCS_MIN+= Transforms/Coroutines/CoroCleanup.cpp
SRCS_MIN+= Transforms/Coroutines/CoroEarly.cpp
SRCS_MIN+= Transforms/Coroutines/CoroElide.cpp
SRCS_MIN+= Transforms/Coroutines/CoroFrame.cpp
SRCS_MIN+= Transforms/Coroutines/CoroSplit.cpp
SRCS_MIN+= Transforms/Coroutines/Coroutines.cpp
SRCS_MIN+= Transforms/IPO/AlwaysInliner.cpp
SRCS_MIN+= Transforms/IPO/ArgumentPromotion.cpp
SRCS_MIN+= Transforms/IPO/Attributor.cpp
SRCS_MIN+= Transforms/IPO/BarrierNoopPass.cpp
SRCS_EXT+= Transforms/IPO/BlockExtractor.cpp
SRCS_MIN+= Transforms/IPO/CalledValuePropagation.cpp
SRCS_MIN+= Transforms/IPO/ConstantMerge.cpp
SRCS_MIN+= Transforms/IPO/CrossDSOCFI.cpp
SRCS_MIN+= Transforms/IPO/DeadArgumentElimination.cpp
SRCS_MIN+= Transforms/IPO/ElimAvailExtern.cpp
SRCS_MIN+= Transforms/IPO/ExtractGV.cpp
SRCS_MIN+= Transforms/IPO/ForceFunctionAttrs.cpp
SRCS_MIN+= Transforms/IPO/FunctionAttrs.cpp
SRCS_MIN+= Transforms/IPO/FunctionImport.cpp
SRCS_MIN+= Transforms/IPO/GlobalDCE.cpp
SRCS_MIN+= Transforms/IPO/GlobalOpt.cpp
SRCS_MIN+= Transforms/IPO/GlobalSplit.cpp
SRCS_MIN+= Transforms/IPO/HotColdSplitting.cpp
SRCS_MIN+= Transforms/IPO/IPConstantPropagation.cpp
SRCS_EXT+= Transforms/IPO/IPO.cpp
SRCS_MIN+= Transforms/IPO/InferFunctionAttrs.cpp
SRCS_MIN+= Transforms/IPO/InlineSimple.cpp
SRCS_MIN+= Transforms/IPO/Inliner.cpp
SRCS_MIN+= Transforms/IPO/Internalize.cpp
SRCS_MIN+= Transforms/IPO/LoopExtractor.cpp
SRCS_MIN+= Transforms/IPO/LowerTypeTests.cpp
SRCS_MIN+= Transforms/IPO/MergeFunctions.cpp
SRCS_MIN+= Transforms/IPO/PartialInlining.cpp
SRCS_MIN+= Transforms/IPO/PassManagerBuilder.cpp
SRCS_MIN+= Transforms/IPO/PruneEH.cpp
SRCS_MIN+= Transforms/IPO/SCCP.cpp
SRCS_MIN+= Transforms/IPO/SampleProfile.cpp
SRCS_MIN+= Transforms/IPO/StripDeadPrototypes.cpp
SRCS_MIN+= Transforms/IPO/StripSymbols.cpp
SRCS_MIN+= Transforms/IPO/SyntheticCountsPropagation.cpp
SRCS_MIN+= Transforms/IPO/ThinLTOBitcodeWriter.cpp
SRCS_MIN+= Transforms/IPO/WholeProgramDevirt.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineAddSub.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineAndOrXor.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineAtomicRMW.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineCalls.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineCasts.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineCompares.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineMulDivRem.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombinePHI.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineSelect.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineShifts.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
SRCS_MIN+= Transforms/InstCombine/InstCombineVectorOps.cpp
SRCS_MIN+= Transforms/InstCombine/InstructionCombining.cpp
SRCS_MIN+= Transforms/Instrumentation/AddressSanitizer.cpp
SRCS_MIN+= Transforms/Instrumentation/BoundsChecking.cpp
SRCS_MIN+= Transforms/Instrumentation/CGProfile.cpp
SRCS_MIN+= Transforms/Instrumentation/ControlHeightReduction.cpp
SRCS_MIN+= Transforms/Instrumentation/DataFlowSanitizer.cpp
SRCS_MIN+= Transforms/Instrumentation/GCOVProfiling.cpp
SRCS_MIN+= Transforms/Instrumentation/HWAddressSanitizer.cpp
SRCS_MIN+= Transforms/Instrumentation/IndirectCallPromotion.cpp
SRCS_MIN+= Transforms/Instrumentation/InstrOrderFile.cpp
SRCS_MIN+= Transforms/Instrumentation/InstrProfiling.cpp
SRCS_MIN+= Transforms/Instrumentation/Instrumentation.cpp
SRCS_MIN+= Transforms/Instrumentation/MemorySanitizer.cpp
SRCS_MIN+= Transforms/Instrumentation/PGOInstrumentation.cpp
SRCS_MIN+= Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
SRCS_MIN+= Transforms/Instrumentation/PoisonChecking.cpp
SRCS_MIN+= Transforms/Instrumentation/SanitizerCoverage.cpp
SRCS_MIN+= Transforms/Instrumentation/ThreadSanitizer.cpp
SRCS_MIN+= Transforms/Instrumentation/ValueProfileCollector.cpp
SRCS_MIN+= Transforms/ObjCARC/DependencyAnalysis.cpp
SRCS_EXT+= Transforms/ObjCARC/ObjCARC.cpp
SRCS_MIN+= Transforms/ObjCARC/ObjCARCAPElim.cpp
SRCS_MIN+= Transforms/ObjCARC/ObjCARCContract.cpp
SRCS_MIN+= Transforms/ObjCARC/ObjCARCExpand.cpp
SRCS_MIN+= Transforms/ObjCARC/ObjCARCOpts.cpp
SRCS_MIN+= Transforms/ObjCARC/ProvenanceAnalysis.cpp
SRCS_MIN+= Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
SRCS_MIN+= Transforms/ObjCARC/PtrState.cpp
SRCS_MIN+= Transforms/Scalar/ADCE.cpp
SRCS_MIN+= Transforms/Scalar/AlignmentFromAssumptions.cpp
SRCS_MIN+= Transforms/Scalar/BDCE.cpp
SRCS_MIN+= Transforms/Scalar/CallSiteSplitting.cpp
SRCS_MIN+= Transforms/Scalar/ConstantHoisting.cpp
SRCS_MIN+= Transforms/Scalar/ConstantProp.cpp
SRCS_MIN+= Transforms/Scalar/CorrelatedValuePropagation.cpp
SRCS_MIN+= Transforms/Scalar/DCE.cpp
SRCS_MIN+= Transforms/Scalar/DeadStoreElimination.cpp
SRCS_MIN+= Transforms/Scalar/DivRemPairs.cpp
SRCS_MIN+= Transforms/Scalar/EarlyCSE.cpp
SRCS_MIN+= Transforms/Scalar/FlattenCFGPass.cpp
SRCS_MIN+= Transforms/Scalar/Float2Int.cpp
SRCS_MIN+= Transforms/Scalar/GVN.cpp
SRCS_MIN+= Transforms/Scalar/GVNHoist.cpp
SRCS_MIN+= Transforms/Scalar/GVNSink.cpp
SRCS_MIN+= Transforms/Scalar/GuardWidening.cpp
SRCS_MIN+= Transforms/Scalar/IVUsersPrinter.cpp
SRCS_MIN+= Transforms/Scalar/IndVarSimplify.cpp
SRCS_MIN+= Transforms/Scalar/InductiveRangeCheckElimination.cpp
SRCS_EXT+= Transforms/Scalar/InferAddressSpaces.cpp
SRCS_MIN+= Transforms/Scalar/InstSimplifyPass.cpp
SRCS_MIN+= Transforms/Scalar/JumpThreading.cpp
SRCS_MIN+= Transforms/Scalar/LICM.cpp
SRCS_MIN+= Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
SRCS_MIN+= Transforms/Scalar/LoopDataPrefetch.cpp
SRCS_MIN+= Transforms/Scalar/LoopDeletion.cpp
SRCS_MIN+= Transforms/Scalar/LoopDistribute.cpp
SRCS_MIN+= Transforms/Scalar/LoopFuse.cpp
SRCS_MIN+= Transforms/Scalar/LoopIdiomRecognize.cpp
SRCS_MIN+= Transforms/Scalar/LoopInstSimplify.cpp
SRCS_MIN+= Transforms/Scalar/LoopInterchange.cpp
SRCS_MIN+= Transforms/Scalar/LoopLoadElimination.cpp
SRCS_MIN+= Transforms/Scalar/LoopPassManager.cpp
SRCS_MIN+= Transforms/Scalar/LoopPredication.cpp
SRCS_MIN+= Transforms/Scalar/LoopRerollPass.cpp
SRCS_MIN+= Transforms/Scalar/LoopRotation.cpp
SRCS_MIN+= Transforms/Scalar/LoopSimplifyCFG.cpp
SRCS_MIN+= Transforms/Scalar/LoopSink.cpp
SRCS_MIN+= Transforms/Scalar/LoopStrengthReduce.cpp
SRCS_MIN+= Transforms/Scalar/LoopUnrollPass.cpp
SRCS_MIN+= Transforms/Scalar/LoopUnrollAndJamPass.cpp
SRCS_MIN+= Transforms/Scalar/LoopUnswitch.cpp
SRCS_MIN+= Transforms/Scalar/LoopVersioningLICM.cpp
SRCS_MIN+= Transforms/Scalar/LowerAtomic.cpp
SRCS_MIN+= Transforms/Scalar/LowerConstantIntrinsics.cpp
SRCS_MIN+= Transforms/Scalar/LowerExpectIntrinsic.cpp
SRCS_MIN+= Transforms/Scalar/LowerGuardIntrinsic.cpp
SRCS_MIN+= Transforms/Scalar/LowerMatrixIntrinsics.cpp
SRCS_MIN+= Transforms/Scalar/LowerWidenableCondition.cpp
SRCS_MIN+= Transforms/Scalar/MakeGuardsExplicit.cpp
SRCS_MIN+= Transforms/Scalar/MemCpyOptimizer.cpp
SRCS_MIN+= Transforms/Scalar/MergeICmps.cpp
SRCS_MIN+= Transforms/Scalar/MergedLoadStoreMotion.cpp
SRCS_MIN+= Transforms/Scalar/NaryReassociate.cpp
SRCS_MIN+= Transforms/Scalar/NewGVN.cpp
SRCS_MIN+= Transforms/Scalar/PartiallyInlineLibCalls.cpp
SRCS_MIN+= Transforms/Scalar/PlaceSafepoints.cpp
SRCS_MIN+= Transforms/Scalar/Reassociate.cpp
SRCS_MIN+= Transforms/Scalar/Reg2Mem.cpp
SRCS_MIN+= Transforms/Scalar/RewriteStatepointsForGC.cpp
SRCS_MIN+= Transforms/Scalar/SCCP.cpp
SRCS_MIN+= Transforms/Scalar/SROA.cpp
SRCS_EXT+= Transforms/Scalar/Scalar.cpp
SRCS_MIN+= Transforms/Scalar/Scalarizer.cpp
SRCS_MIN+= Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
SRCS_MIN+= Transforms/Scalar/SimpleLoopUnswitch.cpp
SRCS_MIN+= Transforms/Scalar/SimplifyCFGPass.cpp
SRCS_MIN+= Transforms/Scalar/Sink.cpp
SRCS_MIN+= Transforms/Scalar/SpeculateAroundPHIs.cpp
SRCS_MIN+= Transforms/Scalar/SpeculativeExecution.cpp
SRCS_MIN+= Transforms/Scalar/StraightLineStrengthReduce.cpp
SRCS_MIN+= Transforms/Scalar/StructurizeCFG.cpp
SRCS_MIN+= Transforms/Scalar/TailRecursionElimination.cpp
SRCS_MIN+= Transforms/Scalar/WarnMissedTransforms.cpp
SRCS_MIN+= Transforms/Utils/ASanStackFrameLayout.cpp
SRCS_MIN+= Transforms/Utils/AddDiscriminators.cpp
SRCS_MIN+= Transforms/Utils/BasicBlockUtils.cpp
SRCS_MIN+= Transforms/Utils/BreakCriticalEdges.cpp
SRCS_MIN+= Transforms/Utils/BuildLibCalls.cpp
SRCS_MIN+= Transforms/Utils/BypassSlowDivision.cpp
SRCS_MIN+= Transforms/Utils/CallPromotionUtils.cpp
SRCS_MIN+= Transforms/Utils/CanonicalizeAliases.cpp
SRCS_MIN+= Transforms/Utils/CloneFunction.cpp
SRCS_MIN+= Transforms/Utils/CloneModule.cpp
SRCS_MIN+= Transforms/Utils/CodeExtractor.cpp
SRCS_MIN+= Transforms/Utils/CodeMoverUtils.cpp
SRCS_MIN+= Transforms/Utils/CtorUtils.cpp
SRCS_EXT+= Transforms/Utils/Debugify.cpp
SRCS_MIN+= Transforms/Utils/DemoteRegToStack.cpp
SRCS_MIN+= Transforms/Utils/EntryExitInstrumenter.cpp
SRCS_MIN+= Transforms/Utils/EscapeEnumerator.cpp
SRCS_MIN+= Transforms/Utils/Evaluator.cpp
SRCS_MIN+= Transforms/Utils/FlattenCFG.cpp
SRCS_MIN+= Transforms/Utils/FunctionComparator.cpp
SRCS_MIN+= Transforms/Utils/FunctionImportUtils.cpp
SRCS_MIN+= Transforms/Utils/GlobalStatus.cpp
SRCS_MIN+= Transforms/Utils/GuardUtils.cpp
SRCS_MIN+= Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
SRCS_MIN+= Transforms/Utils/InjectTLIMappings.cpp
SRCS_MIN+= Transforms/Utils/InlineFunction.cpp
SRCS_MIN+= Transforms/Utils/InstructionNamer.cpp
SRCS_MIN+= Transforms/Utils/IntegerDivision.cpp
SRCS_MIN+= Transforms/Utils/LCSSA.cpp
SRCS_MIN+= Transforms/Utils/LibCallsShrinkWrap.cpp
SRCS_MIN+= Transforms/Utils/Local.cpp
SRCS_MIN+= Transforms/Utils/LoopSimplify.cpp
SRCS_MIN+= Transforms/Utils/LoopRotationUtils.cpp
SRCS_MIN+= Transforms/Utils/LoopUnroll.cpp
SRCS_MIN+= Transforms/Utils/LoopUnrollAndJam.cpp
SRCS_MIN+= Transforms/Utils/LoopUnrollPeel.cpp
SRCS_MIN+= Transforms/Utils/LoopUnrollRuntime.cpp
SRCS_MIN+= Transforms/Utils/LoopUtils.cpp
SRCS_MIN+= Transforms/Utils/LoopVersioning.cpp
SRCS_MIN+= Transforms/Utils/LowerInvoke.cpp
SRCS_MIN+= Transforms/Utils/LowerSwitch.cpp
SRCS_MIN+= Transforms/Utils/Mem2Reg.cpp
SRCS_MIN+= Transforms/Utils/MetaRenamer.cpp
SRCS_MIN+= Transforms/Utils/MisExpect.cpp
SRCS_MIN+= Transforms/Utils/ModuleUtils.cpp
SRCS_MIN+= Transforms/Utils/NameAnonGlobals.cpp
SRCS_MIN+= Transforms/Utils/PredicateInfo.cpp
SRCS_MIN+= Transforms/Utils/PromoteMemoryToRegister.cpp
SRCS_MIN+= Transforms/Utils/SSAUpdater.cpp
SRCS_MIN+= Transforms/Utils/SanitizerStats.cpp
SRCS_MIN+= Transforms/Utils/SimplifyCFG.cpp
SRCS_MIN+= Transforms/Utils/SimplifyIndVar.cpp
SRCS_MIN+= Transforms/Utils/SimplifyLibCalls.cpp
SRCS_MIN+= Transforms/Utils/SizeOpts.cpp
SRCS_MIN+= Transforms/Utils/SplitModule.cpp
SRCS_MIN+= Transforms/Utils/StripGCRelocates.cpp
SRCS_MIN+= Transforms/Utils/StripNonLineTableDebugInfo.cpp
SRCS_MIN+= Transforms/Utils/SymbolRewriter.cpp
SRCS_MIN+= Transforms/Utils/UnifyFunctionExitNodes.cpp
SRCS_EXT+= Transforms/Utils/Utils.cpp
SRCS_MIN+= Transforms/Utils/VNCoercion.cpp
SRCS_MIN+= Transforms/Utils/ValueMapper.cpp
SRCS_MIN+= Transforms/Vectorize/LoadStoreVectorizer.cpp
SRCS_MIN+= Transforms/Vectorize/LoopVectorizationLegality.cpp
SRCS_MIN+= Transforms/Vectorize/LoopVectorize.cpp
SRCS_MIN+= Transforms/Vectorize/SLPVectorizer.cpp
SRCS_MIN+= Transforms/Vectorize/VPlan.cpp
SRCS_MIN+= Transforms/Vectorize/VPlanHCFGBuilder.cpp
SRCS_MIN+= Transforms/Vectorize/VPlanPredicator.cpp
SRCS_MIN+= Transforms/Vectorize/VPlanTransforms.cpp
SRCS_MIN+= Transforms/Vectorize/VPlanVerifier.cpp
SRCS_EXT+= Transforms/Vectorize/Vectorize.cpp
SRCS_EXT+= XRay/BlockIndexer.cpp
SRCS_EXT+= XRay/BlockVerifier.cpp
SRCS_EXT+= XRay/FDRRecordProducer.cpp
SRCS_EXT+= XRay/FDRRecords.cpp
SRCS_EXT+= XRay/FDRTraceExpander.cpp
SRCS_EXT+= XRay/FileHeaderReader.cpp
SRCS_EXT+= XRay/InstrumentationMap.cpp
SRCS_EXT+= XRay/LogBuilderConsumer.cpp
SRCS_EXT+= XRay/RecordInitializer.cpp
SRCS_EXT+= XRay/Trace.cpp
SRCS_ALL+= ${SRCS_MIN}
.if !defined(TOOLS_PREFIX) || ${MK_LLD_BOOTSTRAP} != "no"
SRCS_ALL+= ${SRCS_MIW}
.endif
.if ${MK_CLANG_EXTRAS} != "no"
SRCS_ALL+= ${SRCS_EXT}
.endif
.if ${MK_CLANG_FULL} != "no"
SRCS_ALL+= ${SRCS_FUL}
.endif
.if ${MK_CLANG_EXTRAS} != "no" || ${MK_LLD} != "no" || \
(defined(TOOLS_PREFIX) && ${MK_LLD_BOOTSTRAP} != "no")
SRCS_ALL+= ${SRCS_EXL}
.endif
.if ${MK_LLD} != "no" || \
(defined(TOOLS_PREFIX) && ${MK_LLD_BOOTSTRAP} != "no")
SRCS_ALL+= ${SRCS_LLD}
.endif
.if ${MK_CLANG_EXTRAS} != "no" || ${MK_LLDB} != "no"
SRCS_ALL+= ${SRCS_XDB}
.endif
.if ${MK_CLANG_EXTRAS} != "no" || ${MK_LLDB} != "no" || ${MK_LLD} != "no" || \
(defined(TOOLS_PREFIX) && ${MK_LLD_BOOTSTRAP} != "no")
SRCS_ALL+= ${SRCS_XDL}
.endif
.if ${MK_CLANG_EXTRAS} != "no" || ${MK_LLDB} != "no" || !defined(TOOLS_PREFIX)
SRCS_ALL+= ${SRCS_XDW}
.endif
SRCS+= ${SRCS_ALL:O}
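# A summary of the tier scheme above (descriptive only, not normative):
# SRCS_MIN is always compiled; SRCS_MIW, SRCS_EXT, SRCS_FUL, SRCS_EXL,
# SRCS_LLD, SRCS_XDB, SRCS_XDL and SRCS_XDW are added only when the
# corresponding MK_* knobs (or a TOOLS_PREFIX bootstrap build) require them.
# The :O modifier sorts the combined list lexically, e.g. (hypothetical
# variable DEMO):
#   DEMO=  b.cpp a.cpp c.cpp
#   SRCS+= ${DEMO:O}          # expands to "a.cpp b.cpp c.cpp"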
llvm/IR/Attributes.inc: ${LLVM_SRCS}/include/llvm/IR/Attributes.td
${LLVM_TBLGEN} -gen-attrs \
-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
${LLVM_SRCS}/include/llvm/IR/Attributes.td
TGHDRS+= llvm/IR/Attributes.inc
llvm/IR/IntrinsicEnums.inc: ${LLVM_SRCS}/include/llvm/IR/Intrinsics.td
${LLVM_TBLGEN} -gen-intrinsic-enums \
-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
${LLVM_SRCS}/include/llvm/IR/Intrinsics.td
TGHDRS+= llvm/IR/IntrinsicEnums.inc
llvm/IR/IntrinsicImpl.inc: ${LLVM_SRCS}/include/llvm/IR/Intrinsics.td
${LLVM_TBLGEN} -gen-intrinsic-impl \
-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
${LLVM_SRCS}/include/llvm/IR/Intrinsics.td
TGHDRS+= llvm/IR/IntrinsicImpl.inc
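# The three rules above share one pattern: run ${LLVM_TBLGEN} with a backend
# flag (-gen-attrs, -gen-intrinsic-enums, -gen-intrinsic-impl), -I for the
# include path, -d to write a make dependency file next to the target, and
# -o for the generated header. A minimal sketch of the shape (FooGen.inc,
# Foo.td and -gen-foo are placeholders):
#   FooGen.inc: ${LLVM_SRCS}/include/llvm/IR/Foo.td
#           ${LLVM_TBLGEN} -gen-foo \
#               -I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
#               ${LLVM_SRCS}/include/llvm/IR/Foo.td
#   TGHDRS+= FooGen.inc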
.for arch in \
AArch64/aarch64 AMDGPU/amdgcn ARM/arm BPF/bpf Hexagon/hexagon \
Mips/mips NVPTX/nvvm PowerPC/ppc R600/r600 RISCV/riscv S390/s390 \
WebAssembly/wasm X86/x86 XCore/xcore
llvm/IR/Intrinsics${arch:H}.h: ${LLVM_SRCS}/include/llvm/IR/Intrinsics.td
${LLVM_TBLGEN} -gen-intrinsic-enums -intrinsic-prefix=${arch:T} \
-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
${LLVM_SRCS}/include/llvm/IR/Intrinsics.td
TGHDRS+= llvm/IR/Intrinsics${arch:H}.h
.endfor
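# In the loop above each ${arch} entry is a "Name/prefix" pair; the :H and
# :T modifiers split it at the slash. For example, with arch=AArch64/aarch64:
#   ${arch:H} -> AArch64   (used in the generated header name)
#   ${arch:T} -> aarch64   (passed to -intrinsic-prefix)
# so the rule produces llvm/IR/IntrinsicsAArch64.h from Intrinsics.td.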
AttributesCompatFunc.inc: ${LLVM_SRCS}/lib/IR/AttributesCompatFunc.td
${LLVM_TBLGEN} -gen-attrs \
-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
${LLVM_SRCS}/lib/IR/AttributesCompatFunc.td
TGHDRS+= AttributesCompatFunc.inc
InstCombineTables.inc: ${LLVM_SRCS}/lib/Transforms/InstCombine/InstCombineTables.td
${LLVM_TBLGEN} -gen-searchable-tables \
-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
${LLVM_SRCS}/lib/Transforms/InstCombine/InstCombineTables.td
TGHDRS+= InstCombineTables.inc
llvm-lib/Options.inc: ${LLVM_SRCS}/lib/ToolDrivers/llvm-lib/Options.td
${LLVM_TBLGEN} -gen-opt-parser-defs \
-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
${LLVM_SRCS}/lib/ToolDrivers/llvm-lib/Options.td
TGHDRS+= llvm-lib/Options.inc
CFLAGS.LibDriver.cpp+= -I${.OBJDIR}/llvm-lib
llvm-dlltool/Options.inc: ${LLVM_SRCS}/lib/ToolDrivers/llvm-dlltool/Options.td
${LLVM_TBLGEN} -gen-opt-parser-defs \
-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
${LLVM_SRCS}/lib/ToolDrivers/llvm-dlltool/Options.td
TGHDRS+= llvm-dlltool/Options.inc
CFLAGS.DlltoolDriver.cpp+= -I${.OBJDIR}/llvm-dlltool
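# Note: both option tables above are generated into private subdirectories,
# and the per-file CFLAGS.<source>.cpp+= lines add the matching -I path only
# for the one source that includes each Options.inc, so the two same-named
# generated headers cannot clash.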
beforebuild:
# 20170724: remove stale Options.inc files, of which there are two different
# versions after upstream r308421: one for llvm-lib and one for llvm-dlltool.
.for f in Options.inc
.if exists(${f}) || exists(${f}.d)
@echo Removing stale generated ${f} files
@rm -f ${f} ${f}.d
.endif
.endfor
# Note: some of the rules generated below are superfluous, as not every
# arch/header combination is valid.
.for arch in \
AArch64/AArch64 ARM/ARM BPF/BPF Mips/Mips PowerPC/PPC RISCV/RISCV \
X86/X86
. for hdr in \
AsmMatcher/-gen-asm-matcher \
AsmWriter1/-gen-asm-writer,-asmwriternum=1 \
AsmWriter/-gen-asm-writer \
CallingConv/-gen-callingconv \
CodeEmitter/-gen-emitter \
CompressInstEmitter/-gen-compress-inst-emitter \
DAGISel/-gen-dag-isel \
DisassemblerTables/-gen-disassembler \
EVEX2VEXTables/-gen-x86-EVEX2VEX-tables \
FastISel/-gen-fast-isel \
GICombiner/-gen-global-isel-combiner,-combiners=${arch:H}PreLegalizerCombinerHelper \
GlobalISel/-gen-global-isel \
InstrInfo/-gen-instr-info \
MCCodeEmitter/-gen-emitter \
MCPseudoLowering/-gen-pseudo-lowering \
RegisterBank/-gen-register-bank \
RegisterInfo/-gen-register-info \
SubtargetInfo/-gen-subtarget \
SystemOperands/-gen-searchable-tables \
SystemRegister/-gen-searchable-tables
${arch:T}Gen${hdr:H}.inc: ${LLVM_SRCS}/lib/Target/${arch:H}/${arch:T}.td
${LLVM_TBLGEN} ${hdr:T:C/,/ /g} \
-I ${LLVM_SRCS}/include -I ${LLVM_SRCS}/lib/Target/${arch:H} \
-d ${.TARGET}.d -o ${.TARGET} \
${LLVM_SRCS}/lib/Target/${arch:H}/${arch:T}.td
. endfor
.endfor
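# In the nested loops above, each ${hdr} entry packs "IncSuffix/tblgen-flags"
# into a single word: ${hdr:H} names the generated ${arch:T}Gen<IncSuffix>.inc
# and ${hdr:T:C/,/ /g} turns the comma-joined flags back into separate
# arguments. For example, with arch=X86/X86 and
# hdr=AsmWriter1/-gen-asm-writer,-asmwriternum=1:
#   target:      X86GenAsmWriter1.inc
#   tblgen args: -gen-asm-writer -asmwriternum=1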
.if ${MK_LLVM_TARGET_AARCH64} != "no"
TGHDRS+= AArch64GenAsmMatcher.inc
TGHDRS+= AArch64GenAsmWriter.inc
TGHDRS+= AArch64GenAsmWriter1.inc
TGHDRS+= AArch64GenCallingConv.inc
TGHDRS+= AArch64GenDAGISel.inc
TGHDRS+= AArch64GenDisassemblerTables.inc
TGHDRS+= AArch64GenFastISel.inc
TGHDRS+= AArch64GenGICombiner.inc
TGHDRS+= AArch64GenGlobalISel.inc
TGHDRS+= AArch64GenInstrInfo.inc
TGHDRS+= AArch64GenMCCodeEmitter.inc
TGHDRS+= AArch64GenMCPseudoLowering.inc
TGHDRS+= AArch64GenRegisterBank.inc
TGHDRS+= AArch64GenRegisterInfo.inc
TGHDRS+= AArch64GenSubtargetInfo.inc
TGHDRS+= AArch64GenSystemOperands.inc
.endif # MK_LLVM_TARGET_AARCH64
.if ${MK_LLVM_TARGET_ARM} != "no"
TGHDRS+= ARMGenAsmMatcher.inc
TGHDRS+= ARMGenAsmWriter.inc
TGHDRS+= ARMGenCallingConv.inc
TGHDRS+= ARMGenDAGISel.inc
TGHDRS+= ARMGenDisassemblerTables.inc
TGHDRS+= ARMGenFastISel.inc
TGHDRS+= ARMGenGlobalISel.inc
TGHDRS+= ARMGenInstrInfo.inc
TGHDRS+= ARMGenMCCodeEmitter.inc
TGHDRS+= ARMGenMCPseudoLowering.inc
TGHDRS+= ARMGenRegisterBank.inc
TGHDRS+= ARMGenRegisterInfo.inc
TGHDRS+= ARMGenSubtargetInfo.inc
TGHDRS+= ARMGenSystemRegister.inc
.endif # MK_LLVM_TARGET_ARM
.if ${MK_LLVM_TARGET_BPF} != "no"
TGHDRS+= BPFGenAsmMatcher.inc
TGHDRS+= BPFGenAsmWriter.inc
TGHDRS+= BPFGenCallingConv.inc
TGHDRS+= BPFGenDAGISel.inc
TGHDRS+= BPFGenDisassemblerTables.inc
TGHDRS+= BPFGenInstrInfo.inc
TGHDRS+= BPFGenMCCodeEmitter.inc
TGHDRS+= BPFGenRegisterInfo.inc
TGHDRS+= BPFGenSubtargetInfo.inc
.endif # MK_LLVM_TARGET_BPF
.if ${MK_LLVM_TARGET_MIPS} != "no"
TGHDRS+= MipsGenAsmMatcher.inc
TGHDRS+= MipsGenAsmWriter.inc
TGHDRS+= MipsGenCallingConv.inc
TGHDRS+= MipsGenDAGISel.inc
TGHDRS+= MipsGenDisassemblerTables.inc
TGHDRS+= MipsGenFastISel.inc
TGHDRS+= MipsGenGlobalISel.inc
TGHDRS+= MipsGenInstrInfo.inc
TGHDRS+= MipsGenMCCodeEmitter.inc
TGHDRS+= MipsGenMCPseudoLowering.inc
TGHDRS+= MipsGenRegisterBank.inc
TGHDRS+= MipsGenRegisterInfo.inc
TGHDRS+= MipsGenSubtargetInfo.inc
.endif # MK_LLVM_TARGET_MIPS
.if ${MK_LLVM_TARGET_POWERPC} != "no"
TGHDRS+= PPCGenAsmMatcher.inc
TGHDRS+= PPCGenAsmWriter.inc
TGHDRS+= PPCGenCallingConv.inc
TGHDRS+= PPCGenDAGISel.inc
TGHDRS+= PPCGenDisassemblerTables.inc
TGHDRS+= PPCGenFastISel.inc
TGHDRS+= PPCGenInstrInfo.inc
TGHDRS+= PPCGenMCCodeEmitter.inc
TGHDRS+= PPCGenRegisterInfo.inc
TGHDRS+= PPCGenSubtargetInfo.inc
.endif # MK_LLVM_TARGET_POWERPC
.if ${MK_LLVM_TARGET_RISCV} != "no"
TGHDRS+= RISCVGenAsmMatcher.inc
TGHDRS+= RISCVGenAsmWriter.inc
TGHDRS+= RISCVGenCallingConv.inc
TGHDRS+= RISCVGenCompressInstEmitter.inc
TGHDRS+= RISCVGenDAGISel.inc
TGHDRS+= RISCVGenDisassemblerTables.inc
TGHDRS+= RISCVGenGlobalISel.inc
TGHDRS+= RISCVGenInstrInfo.inc
TGHDRS+= RISCVGenMCCodeEmitter.inc
TGHDRS+= RISCVGenMCPseudoLowering.inc
TGHDRS+= RISCVGenRegisterBank.inc
TGHDRS+= RISCVGenRegisterInfo.inc
TGHDRS+= RISCVGenSubtargetInfo.inc
TGHDRS+= RISCVGenSystemOperands.inc
.endif # MK_LLVM_TARGET_RISCV
.if ${MK_LLVM_TARGET_X86} != "no"
TGHDRS+= X86GenAsmMatcher.inc
TGHDRS+= X86GenAsmWriter.inc
TGHDRS+= X86GenAsmWriter1.inc
TGHDRS+= X86GenCallingConv.inc
TGHDRS+= X86GenDAGISel.inc
TGHDRS+= X86GenDisassemblerTables.inc
TGHDRS+= X86GenEVEX2VEXTables.inc
TGHDRS+= X86GenFastISel.inc
TGHDRS+= X86GenGlobalISel.inc
TGHDRS+= X86GenInstrInfo.inc
TGHDRS+= X86GenRegisterBank.inc
TGHDRS+= X86GenRegisterInfo.inc
TGHDRS+= X86GenSubtargetInfo.inc
.endif # MK_LLVM_TARGET_X86
DEPENDFILES+= ${TGHDRS:C/$/.d/}
DPSRCS+= ${TGHDRS}
CLEANFILES+= ${TGHDRS} ${TGHDRS:C/$/.d/}
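# ${TGHDRS:C/$/.d/} appends ".d" to every generated header name, so the
# tblgen dependency files are tracked and cleaned alongside the headers,
# e.g. AArch64GenAsmMatcher.inc -> AArch64GenAsmMatcher.inc.d.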
.include "../llvm.build.mk"
.include <bsd.lib.mk>
